Convert log file into json file using python - python

I am new to Python. I am trying to convert a log file into a JSON file using a Python script. I created a main file and a del6 file; the script converts the log file and writes it into a new JSON file. On execution, it shows me the following error:
Traceback (most recent call last):
  File "main.py", line 23, in <module>
    main()
  File "main.py", line 14, in main
    print toJson(sys.argv[2])
  File "/home/paulsteven/BEAT/apache/del6.py", line 46, in toJson
    entries = readfile(file)
  File "/home/paulsteven/BEAT/apache/del6.py", line 21, in readfile
    filecontent[index] = line2dict(line)
  File "/home/paulsteven/BEAT/apache/del6.py", line 39, in line2dict
    res = m.groupdict()
AttributeError: 'NoneType' object has no attribute 'groupdict'
I tried this link: log to json, but it doesn't give me a proper solution. Is there any way to solve this?
Here is my sample log file:
February 14 2019, 15:38:47 172.217.160.132 www.google.com up tcp-tcp# www.google.com 172.217.160.132
February 14 2019, 15:38:47 104.28.4.86 www.smackcoders.com up tcp-tcp# www.smackcoders.com 104.28.4.86
The output should be like:
{"1": {"timestamp": "February 14 2019, 15:38:47", "monitorip": "172.217.160.132 ", "monitorhost": "www.google.com", "monitorstatus": "up", "monitorid": "tcp-tcp# www.google.com", "resolveip": "172.217.160.132"}, "2": {"timestamp": "February 14 2019, 15:38:47", "monitorip": "104.28.4.86", "monitorhost": "www.smackcoders.com", "monitorstatus": "up", "monitorid": "tcp-tcp# www.smackcoders.com", "resolveip": "104.28.4.86"}
Here is main python code:
import sys
from del6 import *

def main():
    if len(sys.argv) < 3:
        print "Incorrect Syntax. Usage: python main.py -f <filename>"
        sys.exit(2)
    elif sys.argv[1] != "-f":
        print "Invalid switch '"+sys.argv[1]+"'"
        sys.exit(2)
    elif os.path.isfile(sys.argv[2]) == False:
        print "File does not exist"
        sys.exit(2)
    print toJson(sys.argv[2])
    text_file = open("tcp.json", "a+")
    text_file.write(toJson(sys.argv[2]))
    text_file.write("\n")
    text_file.close()

if __name__ == "__main__":
    main()
Here's my del6 code:
import fileinput
import re
import os
try: import simplejson as json
except ImportError: import json

#read input file and return entries' Dict Object
def readfile(file):
    filecontent = {}
    index = 0
    #necessary file size check
    statinfo = os.stat(file)
    #just a guesstimate. I believe a single entry contains at least 150 chars
    if statinfo.st_size < 150:
        print "Not a valid access_log file. It does not have enough data"
    else:
        for line in fileinput.input(file):
            index = index+1
            if line != "\n": #don't read newlines
                filecontent[index] = line2dict(line)
        fileinput.close()
    return filecontent

#gets a line of string from the Log and converts it into a Dict Object
def line2dict(line):
    #Snippet, thanks to http://www.seehuhn.de/blog/52
    parts = [
        r'(?P<timestamp>\S+)',
        r'(?P<monitorip>\S+)',
        r'(?P<monitorhost>\S+)',
        r'(?P<monitorstatus>\S+)',
        r'"(?P<monitorid>\S+)"',
        r'(?P<resolveip>\S+)',
    ]
    pattern = re.compile(r'\s+'.join(parts)+r'\s*\Z')
    m = pattern.match(line)
    res = m.groupdict()
    return res

#to get JSON of the entire Log
#returns JSON object
def toJson(file):
    #get dict object for each entry
    entries = readfile(file)
    return json.JSONEncoder().encode(entries)
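For reference, the AttributeError happens because pattern.match(line) returns None for any line the regex doesn't fit: the sample log's timestamp ("February 14 2019, 15:38:47") contains spaces, so (?P<timestamp>\S+) cannot cover it, and the log has no literal quotes around the monitorid. A minimal sketch of a pattern that does match the sample lines above (assuming whitespace-separated fields and exactly that timestamp shape), with a guard instead of an unconditional groupdict():

import re

parts = [
    r'(?P<timestamp>\w+ \d{1,2} \d{4}, \d{2}:\d{2}:\d{2})',  # February 14 2019, 15:38:47
    r'(?P<monitorip>\S+)',
    r'(?P<monitorhost>\S+)',
    r'(?P<monitorstatus>\S+)',
    r'(?P<monitorid>\S+\s+\S+)',   # e.g. tcp-tcp# www.google.com (two tokens, no quotes)
    r'(?P<resolveip>\S+)',
]
pattern = re.compile(r'\s+'.join(parts) + r'\s*\Z')

m = pattern.match('February 14 2019, 15:38:47 172.217.160.132 www.google.com up tcp-tcp# www.google.com 172.217.160.132')
if m is not None:   # match() returns None on a non-matching line
    print m.groupdict()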

I see that the columns are divided by double tabs. So, based on that:
i = 1
result = {}
with open('log.txt') as f:
    lines = f.readlines()
    for line in lines:
        r = line.split('\t\t')
        result[i] = {'timestamp': r[0], 'monitorip': r[1], 'monitorhost': r[2], 'monitorstatus': r[3], 'monitorid': r[4], 'resolveip': r[5]}
        i += 1
Output:
{1: {'timestamp': 'February 14 2019, 15:38:47', 'monitorip': '172.217.160.132', 'monitorhost': 'www.google.com', 'monitorstatus': 'up', 'monitorid': 'tcp-tcp# www.google.com', 'resolveip': '172.217.160.132\n'}, 2: {'timestamp': 'February 14 2019, 15:38:47', 'monitorip': '104.28.4.86', 'monitorhost': 'www.smackcoders.com', 'monitorstatus': 'up', 'monitorid': 'tcp-tcp# www.smackcoders.com', 'resolveip': '104.28.4.86'}}
Or, if you want a list of dicts, which is more natural:
result = []
with open('log.txt') as f:
    lines = f.readlines()
    for line in lines:
        r = line.split('\t\t')
        result.append({'timestamp': r[0], 'monitorip': r[1], 'monitorhost': r[2], 'monitorstatus': r[3], 'monitorid': r[4], 'resolveip': r[5]})
Output:
[{'timestamp': 'February 14 2019, 15:38:47', 'monitorip': '172.217.160.132', 'monitorhost': 'www.google.com', 'monitorstatus': 'up', 'monitorid': 'tcp-tcp# www.google.com', 'resolveip': '172.217.160.132\n'}, {'timestamp': 'February 14 2019, 15:38:47', 'monitorip': '104.28.4.86', 'monitorhost': 'www.smackcoders.com', 'monitorstatus': 'up', 'monitorid': 'tcp-tcp# www.smackcoders.com', 'resolveip': '104.28.4.86'}]
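Note the trailing '\n' on the last field in both outputs above; it comes straight from readlines(). If that matters, stripping the line before splitting avoids it (a one-line tweak to the loops above):

r = line.rstrip('\n').split('\t\t')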

Thanks for the answer.
To save it in a JSON file:
import json

i = 1
result = {}
with open('tcp.log') as f:
    lines = f.readlines()
    for line in lines:
        r = line.split('\t\t')
        result[i] = {'timestamp': r[0], 'monitorip': r[1], 'monitorhost': r[2], 'monitorstatus': r[3], 'monitorid': r[4], 'resolveip': r[5]}
        i += 1
print(result)
with open('data.json', 'w') as fp:
    json.dump(result, fp)
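Note that json.dump writes the integer keys 1, 2, ... as strings "1", "2", ..., matching the desired output. Passing indent is optional but makes the file easier to inspect:

with open('data.json', 'w') as fp:
    json.dump(result, fp, indent=4)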

Below is a generic approach to the problem. The function log_lines_to_json will handle any text file where the fields are separated by field_delimiter and the field names are field_names.
FIELD_NAMES = ['timestamp', 'monitorip', 'monitorhost', 'monitorstatus', 'monitorid', 'resolveip']
FIELD_DELIMITER = '\t\t'

def log_lines_to_json(log_file, field_names, field_delimiter):
    result = []
    with open(log_file) as f:
        lines = f.readlines()
        for line in lines:
            fields = line.split(field_delimiter)
            result.append({field_name: fields[idx] for idx, field_name in enumerate(field_names)})
    return result

entries = log_lines_to_json('log.txt', FIELD_NAMES, FIELD_DELIMITER)
for entry in entries:
    print(entry)
Output:
{'monitorid': 'tcp-tcp# www.google.com', 'monitorstatus': 'up', 'timestamp': 'February 14 2019, 15:38:47', 'monitorhost': 'www.google.com', 'monitorip': '172.217.160.132', 'resolveip': '172.217.160.132\n'}
{'monitorid': 'tcp-tcp# www.smackcoders.com', 'monitorstatus': 'up', 'timestamp': 'February 14 2019, 15:38:47', 'monitorhost': 'www.smackcoders.com', 'monitorip': '104.28.4.86', 'resolveip': '104.28.4.86'}
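A possible compact variant of the dict comprehension above pairs the names and fields with zip and strips each field, so the trailing newline on resolveip disappears (same behaviour otherwise, assuming the counts of names and fields match):

result.append(dict(zip(field_names, (field.strip() for field in fields))))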

Related

Looping over list in python and appending the result in a list

I have code like below:
from io import StringIO

a = """ab: 01dvfgf
cd: 01fgvr windows
ab: 02hjuy linux
cd: 01erttt windows
lm: 02hjkkk"""

s = StringIO(a)
a_01 = []
a_02 = []
zone = ['01', '02']
for elements in zone:
    for line in s:
        if line[4:6] == '01':
            a_01.append(line)
        elif line[4:6] == '02':
            a_02.append(line)
print('a_01', *a_01, sep="\n")
print('a_02', *a_02, sep="\n")
In this code, can I replace the four lines below with two, so that I don't have to write them again and again for different zones?
if line[4:6] == '01':
    a_01.append(line)
elif line[4:6] == '02':
    a_02.append(line)
something like:
if line[4:6] == elements:
    "a_" + elements.append(line)
from collections import defaultdict
...
s = StringIO(a)
zones = defaultdict(list)
for line in s:
    zones[line[4:6]].append(line)

for zone, lines in zones.items():  # can be iterated just as a regular dict
    print(zone, *lines, sep="\n")
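If lines from unexpected zones should be ignored rather than collected, a hedged variant of the loop above filters first (wanted is just an illustrative name for the question's zone list):

wanted = ('01', '02')
for line in s:
    key = line[4:6]
    if key in wanted:   # skip lines from any other zone
        zones[key].append(line)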
Use a dict to store the values.
from io import StringIO

a = """ab: 01dvfgf
cd: 01fgvr windows
ab: 02hjuy linux
cd: 01erttt windows
lm: 02hjkkk"""

s = StringIO(a)
zone_dict = {'01': list(), '02': list()}
for line in s:
    #print(line)
    for key in zone_dict.keys():
        #print(key)
        if line[4:6] == key:
            zone_dict[key].append(line)
zone_dict
Output:
{'01': ['ab: 01dvfgf\n', 'cd: 01fgvr windows\n', 'cd: 01erttt windows\n'],
'02': ['ab: 02hjuy linux\n', 'lm: 02hjkkk']}

Creating a dictionary from a text file Python

I have to create a dictionary based on a csv file that looks like this:
'song, 2000, 184950'
'boom, 2009, 83729'
'boom, 2010, 284500'
'boom, 2011, 203889'
'pow, 2000, 385920'
'pow, 2001, 248930'
From this, I have to create a dictionary with the word as the key and a list of class objects as the value.
This is what I have so far...
class Counter():
    __slots__ = ('year', 'count')
    _types = (int, int)

def readfile(file):
    d = dict()
    with open(file) as f:
        for line in f:
            element = line.split(',')
            for word in element:
                if word in d:
                    d[word].append([Count(int(element[1]), int(element[2]))])
                else:
                    d[word] = [Count(int(element[1]), int(element[2]))]
    print(d)
The output I'm getting is weird: it gives me a dictionary similar to what mine should look like, but it uses the counts (183930) as the key instead of the name. I also need it to append the class object to the value if the word is already listed in the dictionary.
For example, since 'boom' should already be in the dictionary with {'boom' : Count(year = 2009, count = 83729)}, I want there to be a list of those Count objects under the one value.
expected output:
{'song' : [Count(year= 2000, count= 184950)], 'boom' : [Count(year=2009, count=83729),
Count(year=2010, count= 284500), Count(year=2011, count=203889)], 'pow' : ...etc..}
With this loop:
for word in element:
    if word in d:
        d[word].append([Count(int(element[1]), int(element[2]))])
    else:
        d[word] = [Count(int(element[1]), int(element[2]))]
You are iterating through all words on the line, so you call (for the first line):
d['song'].append([Count(int('2000'), int('184950'))])
d['2000'].append([Count(int('2000'), int('184950'))])
d['184950'].append([Count(int('2000'), int('184950'))])
Just use:
for line in f:
    element = line.split(',')
    word = element[0]
    if word in d:
        d[word].append(Count(int(element[1]), int(element[2])))
    else:
        d[word] = [Count(int(element[1]), int(element[2]))]
You can also drop the condition if word in d by using collections.defaultdict:
import collections

def readfile(file):
    d = collections.defaultdict(list)
    with open(file) as f:
        for line in f:
            element = line.split(',')
            word = element[0]
            d[word].append(Count(int(element[1]), int(element[2])))
    print(d)
Here's some simple code that does what you're looking for.
from collections import defaultdict
from pprint import pprint

class Counter(object):
    __slots__ = ('year', 'count')

    def __init__(self, year, count):
        self.year = year
        self.count = count

    def __str__(self):
        return "Counter(year=%d, count=%d)" % (self.year, self.count)

    def __repr__(self):
        return self.__str__()

def import_counts():
    counts = defaultdict(list)
    with file('data.csv') as f:
        for line in f:
            name, year, count = line.split(',')
            name, year, count = name.strip(), int(year.strip()), int(count.strip())
            counts[name].append(Counter(year, count))
    return counts

pprint(import_counts())
However, I changed the data format into a proper CSV as below.
song, 2000, 184950
boom, 2009, 83729
boom, 2010, 284500
boom, 2011, 203889
pow, 2000, 385920
pow, 2001, 248930
The output generated is as below:
{
'boom': [
Counter(year=2009, count=83729),
Counter(year=2010, count=284500),
Counter(year=2011, count=203889)
],
'pow': [
Counter(year=2000, count=385920),
Counter(year=2001, count=248930)
],
'song': [
Counter(year=2000, count=184950)
]
}
Do note that the above doesn't validate input and would error if given an invalid CSV.
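If validation matters, one hedged way to make the reader tolerant is to wrap each line's parse in try/except and skip bad rows (import_counts_safe is our illustrative name; the skip-and-report policy is just one choice):

def import_counts_safe(path='data.csv'):
    counts = defaultdict(list)
    with open(path) as f:
        for lineno, line in enumerate(f, 1):
            try:
                name, year, count = line.split(',')
                counts[name.strip()].append(Counter(int(year), int(count)))
            except ValueError:
                # wrong field count, or a non-numeric year/count
                print "skipping malformed line %d: %r" % (lineno, line)
    return counts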

Dates to categories

I have an Excel spreadsheet I'm preparing to migrate to Access, and the date column has entries in multiple formats such as: 1963 to 1969, Aug. 1968 to Sept. 1968, 1972, Mar-73, 24-Jul, Oct. 2, 1980, Aug 29, 1980, July 1946, etc., and 'undated'. I'm pulling the column that will be the key (map number) and the date column into a CSV and writing back to a CSV.
I can strip out four-digit years, but not ranges, and I'm stumped on how to extract days and two-digit years short of reformatting by hand. My code isn't very elegant and probably isn't best practice:
import csv, xlwt, re
# create new Excel document and add sheet
# from tempfile import TemporaryFile
from xlwt import Workbook
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')
# populate first row with header
sheet1.write(0,0,"Year")
sheet1.write(0,1,"Map")
sheet1.write(0,2,"As Entered")
# count variable for populating sheet
rowCount=0
# open csv file and read
with open('C:\dateTestMSDOs.csv', 'rb') as f:
    reader=csv.reader(f)
    for row in reader:
        map = row[0] # first row is map number
        dateRaw = row[1] # second row is raw date as entered
        # write undated and blank entries
        if dateRaw == 'undated':
            yearStr = '0000'
            rowCount +=1
            sheet1.write(rowCount, 0, yearStr)
            sheet1.write(rowCount, 1, map)
            sheet1.write(rowCount, 2, dateRaw)
            #print rowCount, yearStr, map, dateRaw, '\n'
            yearStr=''
        if dateRaw == '':
            yearStr = 'NoEntry'
            rowCount +=1
            sheet1.write(rowCount, 0, yearStr)
            sheet1.write(rowCount, 1, map)
            sheet1.write(rowCount, 2, dateRaw)
            #print rowCount, yearStr, map, dateRaw, '\n'
            yearStr=''
        # search and write instances of four consecutive digits
        try:
            year = re.search(r'\d\d\d\d', dateRaw)
            yearStr= year.group()
            #print yearStr, map, dateRaw
            rowCount +=1
            sheet1.write(rowCount, 0, yearStr)
            sheet1.write(rowCount, 1, map)
            sheet1.write(rowCount, 2, dateRaw)
            #print rowCount, yearStr, map, dateRaw, '\n'
            yearStr=''
        # if none exist flag for cleaning spreadsheet and print
        except:
            #print 'Nope', map, dateRaw
            rowCount +=1
            yearStr='Format'
            sheet1.write(rowCount, 0, yearStr)
            sheet1.write(rowCount, 1, map)
            sheet1.write(rowCount, 2, dateRaw)
            #print rowCount, yearStr, map, dateRaw, '\n'
            yearStr=''
        yearStr=''
        dateRaw=''
book.save('D:\dateProperty.xls')
print "Done!"
I would like to write the day and month to an additional column, as well as pull the second four-digit year from range entries.
You can try using dateutil for this. I think you'd still need to deal with some of the more difficult formats in a different way though. See a sample implementation below:
Code:
import dateutil.parser as dateparser

date_list = ['1963 to 1969',
             'Aug. 1968 to Sept. 1968',
             'Mar-73',
             '24-Jul',
             'Oct. 2 1980',
             'Aug 29, 1980',
             'July 1946',
             'undated']

for d in date_list:
    if 'to' in d:
        a, b = d.split('to')
        # Get the higher number. Use min to get the lower of the two.
        print max(dateparser.parse(a.strip()).year, dateparser.parse(b.strip()).year)
    elif d == 'undated':
        print '0000'
    else:
        yr = dateparser.parse(d).year
        print yr
Result:
1969
1968
1973
2014
1980
1980
1946
0000
[Finished in 0.4s]
The only glaring issue I can see is that 24-Jul returns a year of 2014, because the parser assumes the current day, month, or year in place of a missing component; i.e. Mar-73 will become 1973-03-20 if today is the 20th of the month, etc.
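One way to pin down the missing components is dateutil's default argument, which supplies the fallback datetime instead of "today" (a sketch; the 1900 sentinel is an arbitrary choice):

from datetime import datetime
import dateutil.parser as dateparser

DEFAULT = datetime(1900, 1, 1)
print dateparser.parse('Mar-73', default=DEFAULT).year   # 1973, regardless of the current date
print dateparser.parse('24-Jul', default=DEFAULT).year   # 1900, which flags the missing year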
Not entirely sure if this is what you were going for, but I just used a "simple" regex search and then traversed the sets of groups that matched, applying the function defined for each. If a match is found, the called function (found in the regex_groups variable) returns a dictionary with the following keys: start_day, start_month, start_year, end_day, end_month, end_year.
Then you can do whatever you'd like with those values. Definitely not the cleanest solution but it works, as far as I can tell.
#!/usr/local/bin/python2.7
import re

# Crazy regex
regex_pattern = '(?:(\d{4}) to (\d{4}))|(?:(\w+)\. (\d{4}) to (\w+)\. (\d{4}))|(?:(\w+)-(\d{2}))|(?:(\d{2})-(\w+))|(?:(\w+)\. (\d+), (\d{4}))|(?:(\w+) (\d+), (\d{4}))|(?:(\w+) (\d{4}))|(?:(\d{4}))'

date_strings = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    '1972',
    'Mar-73',
    '24-Jul',
    'Oct. 2, 1980',
    'Aug 29, 1980',
    'July 1946',
]

# Here you set the group matching functions that will be called for a matching group
regex_groups = {
    (1,2): lambda group_matches: {
        'start_day': '', 'start_month': '', 'start_year': group_matches[0],
        'end_day': '', 'end_month': '', 'end_year': group_matches[1]
    },
    (3,4,5,6): lambda group_matches: {
        'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
        'end_day': '', 'end_month': group_matches[2], 'end_year': group_matches[3]
    },
    (7,8): lambda group_matches: {
        'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (9,10): lambda group_matches: {
        'start_day': group_matches[1], 'start_month': '', 'start_year': group_matches[0],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (11,12,13): lambda group_matches: {
        'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (14,15,16): lambda group_matches: {
        'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (17,18): lambda group_matches: {
        'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (19,): lambda group_matches: {
        'start_day': '', 'start_month': '', 'start_year': group_matches[0],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
}

for ds in date_strings:
    matches = re.search(regex_pattern, ds)
    start_month = ''
    start_year = ''
    end_month = ''
    end_year = ''
    for regex_group, group_func in regex_groups.items():
        group_matches = [matches.group(sub_group_num) for sub_group_num in regex_group]
        if all(group_matches):
            match_data = group_func(group_matches)
            print
            print 'Matched:', ds
            print '%s to %s' % ('-'.join([match_data['start_day'], match_data['start_month'], match_data['start_year']]), '-'.join([match_data['end_day'], match_data['end_month'], match_data['end_year']]))
            # match_data is a dictionary with keys:
            # * start_day
            # * start_month
            # * start_year
            # * end_day
            # * end_month
            # * end_year
            # If a group doesn't contain one of those items, then it is set to a blank string
Outputs:
Matched: 1963 to 1969
--1963 to --1969
Matched: Aug. 1968 to Sept. 1968
-Aug-1968 to -Sept-1968
Matched: 1972
--1972 to --
Matched: Mar-73
-Mar-73 to --
Matched: 24-Jul
Jul--24 to --
Matched: Oct. 2, 1980
2-Oct-1980 to --
Matched: Aug 29, 1980
29-Aug-1980 to --
Matched: July 1946
-July-1946 to --
You could define all the possible cases of dates using regex, something like:
import re

s = ['1963 to 1969', 'Aug. 1968 to Sept. 1968',
     '1972', 'Mar-73', '03-Jun', '24-Jul', 'Oct. 2, 1980', 'Oct. 26, 1980',
     'Aug 29 1980', 'July 1946']

def get_year(date):
    mm = re.findall("\d{4}", date)
    if mm:
        return mm
    mm = re.search("\w+-(\d{2})", date)
    if mm:
        return [mm.group(1)]

def get_month(date):
    mm = re.findall("[A-Z][a-z]+", date)
    if mm:
        return mm

def get_day(date):
    d_expr = ["(\d|\d{2})\-[A-Z][a-z]+", "[A-Z][a-z]+[\. ]+(\d|\d{2}),"]
    for expr in d_expr:
        mm = re.search(expr, date)
        if mm:
            return [mm.group(1)]

d = {}
m = {}
y = {}
for idx, date in enumerate(s):
    d[idx] = get_day(date)
    m[idx] = get_month(date)
    y[idx] = get_year(date)

print "Year Dict: ", y
print "Month Dict: ", m
print "Day Dict: ", d
As a result, you get dictionaries of days, months, and years, which can be used to populate the rows.
Output:
Year Dict: {0: ['1963', '1969'], 1: ['1968', '1968'], 2: ['1972'], 3: ['73'], 4: None, 5: None, 6: ['1980'], 7: ['1980'], 8: ['1980'], 9: ['1946']}
Month Dict: {0: None, 1: ['Aug', 'Sept'], 2: None, 3: ['Mar'], 4: ['Jun'], 5: ['Jul'], 6: ['Oct'], 7: ['Oct'], 8: ['Aug'], 9: ['July']}
Day Dict: {0: None, 1: None, 2: None, 3: None, 4: ['03'], 5: ['24'], 6: ['2'], 7: ['26'], 8: None, 9: None}
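To fold these back into one value per row, a small sketch reusing the dictionaries above (the '????' placeholder is arbitrary):

for idx, date in enumerate(s):
    year = (y[idx] or ['????'])[0]   # first year found, or a placeholder
    month = (m[idx] or [''])[0]
    day = (d[idx] or [''])[0]
    print "%-25s -> year=%s month=%s day=%s" % (date, year, month, day)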
Thank you for the innovative suggestions. After consideration, we decided to remove day and month from what would be searchable in our database, since only a relatively small amount of our data had that level of detail. Here is the code I used to extract and generate the data I needed from a long and messy list.
import csv, xlwt, re
# create new Excel document and add sheet
from xlwt import Workbook
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')
# populate first row with header
sheet1.write(0,0,"MapYear_(Parsed)")
sheet1.write(0,1,"Map_Number")
sheet1.write(0,2,"As_Entered")
# count variable for populating sheet
rowCount=0
# open csv file and read
yearStr = ''
with open('C:\mapsDateFix.csv', 'rb') as f:
    reader=csv.reader(f)
    for row in reader:
        map = row[0] # first row is map number
        dateRaw = row[1] # second row is raw date as entered
        # write undated and blank entries
        if dateRaw == 'undated':
            yearStr = 'undated'
            rowCount +=1
            sheet1.write(rowCount, 0, yearStr)
            sheet1.write(rowCount, 1, map)
            sheet1.write(rowCount, 2, dateRaw)
            #print rowCount, yearStr, map, dateRaw, '\n'
            #yearStr=''
        if yearStr != 'undated':
            if dateRaw == '':
                yearStr = 'NoEntry'
                rowCount +=1
                sheet1.write(rowCount, 0, yearStr)
                sheet1.write(rowCount, 1, map)
                sheet1.write(rowCount, 2, dateRaw)
                #print rowCount, yearStr, map, dateRaw, '\n'
                #yearStr=''
        # search and write instances of four consecutive digits
        if yearStr != dateRaw:
            try:
                year = re.search(r'\d\d\d\d', dateRaw)
                yearStr= year.group()
                #print yearStr, map, dateRaw
                rowCount +=1
                sheet1.write(rowCount, 0, yearStr)
                sheet1.write(rowCount, 1, map)
                sheet1.write(rowCount, 2, dateRaw)
                #print rowCount, yearStr, map, dateRaw, '\n'
                yearStr=''
            # if none exist flag for cleaning spreadsheet and print
            except:
                #print 'Nope', map, dateRaw
                rowCount +=1
                yearStr='Format'
                sheet1.write(rowCount, 0, yearStr)
                sheet1.write(rowCount, 1, map)
                sheet1.write(rowCount, 2, dateRaw)
                #print rowCount, yearStr, map, dateRaw, '\n'
                yearStr=''
        yearStr=''
        dateRaw=''
book.save('D:\dateProperty.xls')
print "Done!"

How to store CSV data in a Nested Dictionary that has a dictionary and a list?

I have the following CSV Data,
Rule1,Status1,1
Rule1,Status2,1
Rule1,Status3,1
Rule1,Status4,2
Rule2,Status1,2
Rule2,Status2,1
Rule2,Status3,1
Rule2,Status4,3
I have unique rules (first column) stored in a list called Rules. I want my dictionary to look like the following:
DictionaryFull = {
    'Rule1' : {1 : [Status1, Status2, Status3], 2 : [Status4]},
    'Rule2' : {1 : [Status2, Status3], 2 : [Status1], 3 : [Status4]}
}
Here is what I tried:
import csv

openfile = open('data.csv', 'rU')
finalfile = csv.reader(openfile, delimiter=',')
FullDictionary = {}
for row in finalfile:
    for j in range(0, 300): # 300 = number of rules
        if Rules[j] not in FullDictionary:
            for i in range(1, 71): # these are the third-column numbers 1 - 71
                if i == int(row[2]) and row[0] == Rules[j]:
                    FullDictionary = {Rules[j] : { i : [].append(row[1]) }}
print FullDictionary
But I am getting the following as the result:
{'Rule1': {1 : None}} and so on
Am I doing something wrong? How do I accomplish this task of having a dictionary that holds both another dictionary and a list?
I tried this:
def something():
    full_dictionary = {}
    with open(DataFilePath) as f:
        reader = csv.reader(f)
        for row in reader:
            rule = row[2]
            status = row[0]
            num = int(row[5])
            r = full_dictionary.setdefault(rule, {})
            r.setdefault(num, []).append(status)
    print full_dictionary
The error: ValueError: I/O operation on closed file
How about using collections.defaultdict:
import csv
from collections import defaultdict

full_dictionary = defaultdict(lambda: defaultdict(list))
with open('data.csv') as f:
    reader = csv.reader(f)
    for rule, status, num in reader:
        full_dictionary[rule][num].append(status)
print full_dictionary
output:
defaultdict(<function <lambda> at 0x00000000025A6438>, {
'Rule2': defaultdict(<type 'list'>, {
'1': ['Status2', 'Status3'],
'3': ['Status4'],
'2': ['Status1']
}),
'Rule1': defaultdict(<type 'list'>, {
'1': ['Status1', 'Status2', 'Status3'],
'2': ['Status4']
})
})
If you don't want to use defaultdict, you have to take care of new keys yourself.
For example, using dict.setdefault:
import csv

full_dictionary = {}
with open('data.csv') as f:
    reader = csv.reader(f)
    for rule, status, num in reader:
        r = full_dictionary.setdefault(rule, {})
        r.setdefault(num, []).append(status)
print full_dictionary
output:
{'Rule1': {'1': ['Status1', 'Status2', 'Status3'], '2': ['Status4']},
'Rule2': {'1': ['Status2', 'Status3'], '2': ['Status1'], '3': ['Status4']}}
list.append returns None, so your assignment FullDictionary = {Rules[j] : { i : [].append(row[1]) }} is setting the value to None.
Amend that to:
FullDictionary = {Rules[j] : { i : [row[1]] }}
or
old_value = Rules[j].get(i, [])
old_value.append(row[1])
depending on what you're wishing to achieve.
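The underlying gotcha is easy to demonstrate: list.append mutates the list in place and always returns None:

>>> x = [].append('a')
>>> print x
None
>>> lst = ['a']
>>> lst.append('b')   # returns None, but lst is mutated
>>> lst
['a', 'b']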

How to merge only the unique lines from file_a to file_b?

This question has been asked here in one form or another, but not quite the thing I'm looking for. So, this is the situation I shall be having: I already have one file, named file_a, and I'm creating another file - file_b. file_a is always bigger than file_b in size. There will be a number of duplicate lines in file_b (hence, in file_a as well), but both files will have some unique lines. What I want to do is copy/merge only the unique lines from file_a to file_b and then sort the line order, so that file_b becomes the most up-to-date one with all the unique entries. Neither of the original files should be more than 10MB in size. What's the most efficient (and fastest) way I can do that?
I was thinking of something like this, which does the merging all right:
#!/usr/bin/env python
import os, time, sys

# Convert Date/time to epoch
def toEpoch(dt):
    dt_ptrn = '%d/%m/%y %H:%M:%S'
    return int(time.mktime(time.strptime(dt, dt_ptrn)))

# input files
o_file = "file_a"
c_file = "file_b"
n_file = [o_file,c_file]
m_file = "merged.file"

for x in range(len(n_file)):
    P = open(n_file[x],"r")
    output = P.readlines()
    P.close()

# Sort the output, order by 2nd last field
#sp_lines = [ line.split('\t') for line in output ]
#sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )

F = open(m_file,'w')
#for line in sp_lines:
for line in output:
    if "group_" in line:
        F.write(line)
F.close()
But, it's:
not with only the unique lines
not sorted (by next to last field)
and introduces the 3rd file i.e. m_file
Just a side note (long story short): I can't use sorted() here as I'm using v2.3, unfortunately. The input files look like this:
On 23/03/11 00:40:03
JobID Group.User Ctime Wtime Status QDate CDate
===================================================================================
430792 group_atlas.pltatl16 0 32 4 02/03/11 21:52:38 02/03/11 22:02:15
430793 group_atlas.atlas084 30 472 4 02/03/11 21:57:43 02/03/11 22:09:35
430794 group_atlas.atlas084 12 181 4 02/03/11 22:02:37 02/03/11 22:05:42
430796 group_atlas.atlas084 8 185 4 02/03/11 22:02:38 02/03/11 22:05:46
I tried to use cmp() to sort by the 2nd last field but, I think, it doesn't work just because of the first 3 lines of the input files.
Can anyone please help? Cheers!!!
Update 1:
For future reference, as suggested by Jakob, here is the complete script. It worked just fine.
#!/usr/bin/env python
import os, time, sys
from sets import Set as set

def toEpoch(dt):
    dt_ptrn = '%d/%m/%y %H:%M:%S'
    return int(time.mktime(time.strptime(dt, dt_ptrn)))

def yield_lines(fileobj):
    #I want to discard the headers
    for i in xrange(3):
        fileobj.readline()
    #
    for line in fileobj:
        yield line

def app(path1, path2):
    file1 = set(yield_lines(open(path1)))
    file2 = set(yield_lines(open(path2)))
    return file1.union(file2)

# Input files
o_file = "testScript/03"
c_file = "03.bak"
m_file = "finished.file"

print time.strftime('%H:%M:%S', time.localtime())

# Sorting the output, order by 2nd last field
sp_lines = [ line.split('\t') for line in app(o_file, c_file) ]
sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )

F = open(m_file,'w')
print "No. of lines: ",len(sp_lines)
for line in sp_lines:
    MF = '\t'.join(line)
    F.write(MF)
F.close()
It took about 2m:47s to finish for 145244 lines.
[testac1#serv07 ~]$ ./uniq-merge.py
17:19:21
No. of lines: 145244
17:22:08
thanks!!
Update 2:
Hi eyquem, this is the Error message I get when I run your script(s).
From the first script:
[testac1#serv07 ~]$ ./uniq-merge_2.py
File "./uniq-merge_2.py", line 44
fm.writelines( '\n'.join(v)+'\n' for k,v in output )
^
SyntaxError: invalid syntax
From the second script:
[testac1#serv07 ~]$ ./uniq-merge_3.py
File "./uniq-merge_3.py", line 24
output = sett(line.rstrip() for line in fa)
^
SyntaxError: invalid syntax
Cheers!!
Update 3:
The previous one wasn't sorting the list at all. Thanks to eyquem for pointing that out. Well, it does now. This is a further modification of Jakob's version - I converted the set app(path1, path2) to a list, myList, and then applied sort( lambda ... ) to myList to sort the merged file by the next to last field. This is the final script.
#!/usr/bin/env python
import os, time, sys
from sets import Set as set

def toEpoch(dt):
    # Convert date/time to epoch
    dt_ptrn = '%d/%m/%y %H:%M:%S'
    return int(time.mktime(time.strptime(dt, dt_ptrn)))

def yield_lines(fileobj):
    # Discard the headers (1st 3 lines)
    for i in xrange(3):
        fileobj.readline()
    for line in fileobj:
        yield line

def app(path1, path2):
    # Remove duplicate lines
    file1 = set(yield_lines(open(path1)))
    file2 = set(yield_lines(open(path2)))
    return file1.union(file2)

print time.strftime('%H:%M:%S', time.localtime())

# I/O files
o_file = "testScript/03"
c_file = "03.bak"
m_file = "finished.file"

# Convert set into a list
myList = list(app(o_file, c_file))

# Sort the list by the date
sp_lines = [ line.split('\t') for line in myList ]
sp_lines.sort( lambda a, b: cmp(toEpoch(a[-2]),toEpoch(b[-2])) )

F = open(m_file,'w')
print "No. of lines: ",len(sp_lines)

# Finally write to the outFile
for line in sp_lines:
    MF = '\t'.join(line)
    F.write(MF)
F.close()
There is no speed boost at all; it took 2m:50s to process the same 145244 lines. If anyone sees any scope for improvement, please let me know. Thanks to Jakob and eyquem for their time. Cheers!!
Update 4:
Just for future reference, this is a modified version of eyquem's, which works much better and faster than the previous ones.
#!/usr/bin/env python
import os, sys, re
from sets import Set as sett
from time import mktime, strptime, strftime

def sorting_merge(o_file, c_file, m_file ):
    # RegEx for the Date/time field
    pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d')

    def kl(lines, pat=pat):
        # match only the next to last field
        line = lines.split('\t')
        line = line[-2]
        return mktime(strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))

    output = sett()
    head = []

    # Separate the header & remove the duplicates
    def rmHead(f_n):
        f_n.readline()
        for line1 in f_n:
            if pat.search(line1): break
            else: head.append(line1) # line of the header
        for line in f_n:
            output.add(line.rstrip())
        output.add(line1.rstrip())
        f_n.close()

    fa = open(o_file, 'r')
    rmHead(fa)
    fb = open(c_file, 'r')
    rmHead(fb)

    # Sorting date-wise
    output = [ (kl(line), line.rstrip()) for line in output if line.rstrip() ]
    output.sort()

    fm = open(m_file,'w')
    # Write to the file & add the header
    fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head[0]+head[1])))
    for t,line in output:
        fm.write(line + '\n')
    fm.close()

c_f = "03_a"
o_f = "03_b"
sorting_merge(o_f, c_f, 'outfile.txt')
This version is much faster - 6.99 sec. for 145244 lines, compared to 2m:47s for the previous one using lambda a, b: cmp(). Thanks to eyquem for all his support. Cheers!!
EDIT 2
My previous codes had problems with output = sett(line.rstrip() for line in fa) and output.sort(key=kl).
Moreover, they had some complications.
So I examined the choice, taken by Jakob Bowyer in his code, of reading the files directly with the set() function.
Congratulations Jakob! (and Michal Chruszcz, by the way): set() is unbeatable, it's faster than reading one line at a time.
So I abandoned my idea of reading the files line after line.
.
But I kept my idea of avoiding a sort with the help of the cmp() function because, as described in the doc:
s.sort([cmpfunc=None])
The sort() method takes an optional argument specifying a comparison function of two arguments (list items) (...) Note that this slows the sorting process down considerably
http://docs.python.org/release/2.3/lib/typesseq-mutable.html
Then, I managed to obtain a list of tuples (t, line) in which t is
time.mktime(time.strptime(<1st date-and-hour in line>, '%d/%m/%y %H:%M:%S'))
by the instruction
output = [ (kl(line), line.rstrip()) for line in output]
.
I tested 2 codes. The following one, in which the 1st date-and-hour in a line is extracted with a regex:
def kl(line, pat=pat):
    return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))

output = [ (kl(line), line.rstrip()) for line in output if line.rstrip()]
output.sort()
And a second code in which kl() is:
def kl(line, pat=pat):
    return time.mktime(time.strptime(line.split('\t')[-2],'%d/%m/%y %H:%M:%S'))
.
The results are
Times of execution:
0.03598 seconds for the first code with regex
0.03580 seconds for the second code with split('\t')
that is to say the same
This algorithm is faster than code using the cmp() function:
a code in which the set of lines output isn't transformed into a list of tuples by
output = [ (kl(line), line.rstrip()) for line in output]
but is only transformed into a list of the lines (without duplicates, then) and sorted with a function mycmp() (see the doc):
def mycmp(a, b):
    return cmp(time.mktime(time.strptime(a.split('\t')[-2],'%d/%m/%y %H:%M:%S')),
               time.mktime(time.strptime(b.split('\t')[-2],'%d/%m/%y %H:%M:%S')))

output = [ line.rstrip() for line in output] # not list(output), to avoid the problem of the newline of the last line of each file
output.sort(mycmp)
for line in output:
    fm.write(line+'\n')
has an execution time of
0.11574 seconds
.
The code:
#!/usr/bin/env python
import os, time, sys, re
from sets import Set as sett

def sorting_merge(o_file , c_file, m_file ):
    pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     '(?=[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
    def kl(line, pat=pat):
        return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
    output = sett()
    head = []
    fa = open(o_file)
    fa.readline() # first line is skipped
    while True:
        line1 = fa.readline()
        mat1 = pat.search(line1)
        if not mat1: head.append(line1) # line1 is here a line of the header
        else: break # the loop ends on the first line1 not being a line of the heading
    output = sett( fa )
    fa.close()
    fb = open(c_file)
    while True:
        line1 = fb.readline()
        if pat.search(line1): break
    output = output.union(sett( fb ))
    fb.close()
    output = [ (kl(line),line.rstrip()) for line in output]
    output.sort()
    fm = open(m_file,'w')
    fm.write(time.strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
    for t,line in output:
        fm.write(line + '\n')
    fm.close()

te = time.clock()
sorting_merge('ytre.txt','tataye.txt','merged.file.txt')
print time.clock()-te
This time, I hope it will run correctly; the only thing left to do is to wait for the execution times on real files much bigger than the ones on which I tested the codes.
.
EDIT 3
pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                 '(?=[ \t]+'
                 '[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                 '|'
                 '[ \t]+aborted/deleted)')
.
EDIT 4
#!/usr/bin/env python
import os, time, sys, re
from sets import Set

def sorting_merge(o_file , c_file, m_file ):
    pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     '(?=[ \t]+'
                     '[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     '|'
                     '[ \t]+aborted/deleted)')
    def kl(line, pat=pat):
        return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
    head = []
    output = Set()
    fa = open(o_file)
    fa.readline() # first line is skipped
    for line1 in fa:
        if pat.search(line1): break # first line after the heading
        else: head.append(line1) # line of the header
    for line in fa:
        output.add(line.rstrip())
    output.add(line1.rstrip())
    fa.close()
    fb = open(c_file)
    for line1 in fb:
        if pat.search(line1): break
    for line in fb:
        output.add(line.rstrip())
    output.add(line1.rstrip())
    fb.close()
    if '' in output: output.remove('')
    output = [ (kl(line),line) for line in output]
    output.sort()
    fm = open(m_file,'w')
    fm.write(time.strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
    for t,line in output:
        fm.write(line+'\n')
    fm.close()

te = time.clock()
sorting_merge('A.txt','B.txt','C.txt')
print time.clock()-te
Maybe something along these lines?
from sets import Set as set

def yield_lines(fileobj):
    #I want to discard the headers
    for i in xrange(3):
        fileobj.readline()
    for line in fileobj:
        yield line

def app(path1, path2):
    file1 = set(yield_lines(open(path1)))
    file2 = set(yield_lines(open(path2)))
    return file1.union(file2)
EDIT: Forgot about with :$
I wrote this new code, with the ease of using a set. It is faster than my previous code and, it seems, than your code:
#!/usr/bin/env python
import os, time, sys, re
from sets import Set as sett

def sorting_merge(o_file , c_file, m_file ):
    # Convert Date/time to epoch
    def toEpoch(dt):
        dt_ptrn = '%d/%m/%y %H:%M:%S'
        return int(time.mktime(time.strptime(dt, dt_ptrn)))
    pat = re.compile('([0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)'
                     '[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d')
    fa = open(o_file)
    head = []
    fa.readline()
    while True:
        line1 = fa.readline()
        mat1 = pat.search(line1)
        if not mat1:
            head.append(('',line1.rstrip()))
        else:
            break
    output = sett((toEpoch(pat.search(line).group(1)) , line.rstrip())
                  for line in fa)
    output.add((toEpoch(mat1.group(1)) , line1.rstrip()))
    fa.close()
    fb = open(c_file)
    while True:
        line1 = fb.readline()
        mat1 = pat.search(line1)
        if mat1: break
    for line in fb:
        output.add((toEpoch(pat.search(line).group(1)) , line.rstrip()))
    output.add((toEpoch(mat1.group(1)) , line1.rstrip()))
    fb.close()
    output = list(output)
    output.sort()
    output[0:0] = head
    output[0:0] = [('',time.strftime('On %d/%m/%y %H:%M:%S'))]
    fm = open(m_file,'w')
    fm.writelines( line+'\n' for t,line in output)
    fm.close()

te = time.clock()
sorting_merge('ytr.txt','tatay.txt','merged.file.txt')
print time.clock()-te
Note that this code puts a heading in the merged file.
.
EDIT
Aaaaaah... I got it... :-))
Execution time divided by 3!
#!/usr/bin/env python
import os, time, sys, re
from sets import Set as sett

def sorting_merge(o_file , c_file, m_file ):
    pat = re.compile('[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d'
                     '(?=[ \t]+[0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
    def kl(line, pat=pat):
        return time.mktime(time.strptime((pat.search(line).group()),'%d/%m/%y %H:%M:%S'))
    fa = open(o_file)
    head = []
    fa.readline()
    while True:
        line1 = fa.readline()
        mat1 = pat.search(line1)
        if not mat1:
            head.append(line1.rstrip())
        else:
            break
    output = sett(line.rstrip() for line in fa)
    output.add(line1.rstrip())
    fa.close()
    fb = open(c_file)
    while True:
        line1 = fb.readline()
        mat1 = pat.search(line1)
        if mat1: break
    for line in fb:
        output.add(line.rstrip())
    output.add(line1.rstrip())
    fb.close()
    output = list(output)
    output.sort(key=kl)
    output[0:0] = [time.strftime('On %d/%m/%y %H:%M:%S')] + head
    fm = open(m_file,'w')
    fm.writelines( line+'\n' for line in output)
    fm.close()

te = time.clock()
sorting_merge('ytre.txt','tataye.txt','merged.file.txt')
print time.clock()-te
Last codes, I hope, because I found a killer code.
First, I created two files, "xxA.txt" and "yyB.txt", of 30000 lines each, with lines like
430559 group_atlas.atlas084 12 181 4 04/03/10 01:38:02 02/03/11 22:05:42
430502 group_atlas.atlas084 12 181 4 23/01/10 21:45:05 02/03/11 22:05:42
430544 group_atlas.atlas084 12 181 4 17/06/11 12:58:10 02/03/11 22:05:42
430566 group_atlas.atlas084 12 181 4 25/03/10 23:55:22 02/03/11 22:05:42
with the following code:
create AB.py
from random import choice

n = tuple( str(x) for x in xrange(500,600))
days = ('01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16',
        '17','18','19','20','21','22','23','24','25','26','27','28')
# not '29','30','31' to avoid problems with strptime() on the last days of February
months = days[0:12]
hours = days[0:23]
ms = ['00','01','02','03','04','05','06','07','09'] + [str(x) for x in xrange(10,60)]
repeat = 30000

with open('xxA.txt','w') as f:
    # 430794 group_atlas.atlas084 12 181 4 02/03/11 22:02:37 02/03/11 22:05:42
    ch = ('On 23/03/11 00:40:03\n'
          'JobID Group.User Ctime Wtime Status QDate CDate\n'
          '===================================================================================\n')
    f.write(ch)
    for i in xrange(repeat):
        line = '430%s group_atlas.atlas084 12 181 4 \t%s/%s/%s %s:%s:%s\t02/03/11 22:05:42\n' %\
               (choice(n),
                choice(days),choice(months),choice(('10','11')),
                choice(hours),choice(ms),choice(ms))
        f.write(line)

with open('yyB.txt','w') as f:
    # 430794 group_atlas.atlas084 12 181 4 02/03/11 22:02:37 02/03/11 22:05:42
    ch = ('On 25/03/11 13:45:24\n'
          'JobID Group.User Ctime Wtime Status QDate CDate\n'
          '===================================================================================\n')
    f.write(ch)
    for i in xrange(repeat):
        line = '430%s group_atlas.atlas084 12 181 4 \t%s/%s/%s %s:%s:%s\t02/03/11 22:05:42\n' %\
               (choice(n),
                choice(days),choice(months),choice(('10','11')),
                choice(hours),choice(ms),choice(ms))
        f.write(line)

with open('xxA.txt') as g:
    print 'readlines of xxA.txt :',len(g.readlines())
    g.seek(0,0)
    print 'set of xxA.txt :',len(set(g))
with open('yyB.txt') as g:
    print 'readlines of yyB.txt :',len(g.readlines())
    g.seek(0,0)
    print 'set of yyB.txt :',len(set(g))
Then I ran these 3 programs:
"merging regex.py"
#!/usr/bin/env python
from time import clock,mktime,strptime,strftime
from sets import Set
import re

infunc = []

def sorting_merge(o_file, c_file, m_file ):
    infunc.append(clock()) #infunc[0]
    pat = re.compile('([0123]\d/[01]\d/\d{2} [012]\d:[0-6]\d:[0-6]\d)')
    output = Set()
    def rmHead(filename, a_set):
        f_n = open(filename, 'r')
        f_n.readline()
        head = []
        for line in f_n:
            head.append(line) # line of the header
            if line.strip('= \r\n')=='': break
        for line in f_n:
            a_set.add(line.rstrip())
        f_n.close()
        return head
    infunc.append(clock()) #infunc[1]
    head = rmHead(o_file, output)
    infunc.append(clock()) #infunc[2]
    head = rmHead(c_file, output)
    infunc.append(clock()) #infunc[3]
    if '' in output: output.remove('')
    infunc.append(clock()) #infunc[4]
    output = [ (mktime(strptime(pat.search(line).group(),'%d/%m/%y %H:%M:%S')),line)
               for line in output ]
    infunc.append(clock()) #infunc[5]
    output.sort()
    infunc.append(clock()) #infunc[6]
    fm = open(m_file,'w')
    fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
    for t,line in output:
        fm.write(line + '\n')
    fm.close()
    infunc.append(clock()) #infunc[7]

c_f = "xxA.txt"
o_f = "yyB.txt"
t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergedr.txt')
t2 = clock()
print 'merging regex'
print 'total time of execution :',t2-t1
print ' launching :',infunc[1] - t1
print ' preparation :',infunc[1] - infunc[0]
print ' reading of 1st file :',infunc[2] - infunc[1]
print ' reading of 2nd file :',infunc[3] - infunc[2]
print ' output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print ' sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]
"merging split.py"
#!/usr/bin/env python
from time import clock,mktime,strptime,strftime
from sets import Set
infunc = []
def sorting_merge(o_file, c_file, m_file ):
infunc.append(clock()) #infunc[0]
output = Set()
def rmHead(filename, a_set):
f_n = open(filename, 'r')
f_n.readline()
head = []
for line in f_n:
head.append(line) # line of the header
if line.strip('= \r\n')=='': break
for line in f_n:
a_set.add(line.rstrip())
f_n.close()
return head
infunc.append(clock()) #infunc[1]
head = rmHead(o_file, output)
infunc.append(clock()) #infunc[2]
head = rmHead(c_file, output)
infunc.append(clock()) #infunc[3]
if '' in output: output.remove('')
infunc.append(clock()) #infunc[4]
output = [ (mktime(strptime(line.split('\t')[-2],'%d/%m/%y %H:%M:%S')),line)
for line in output ]
infunc.append(clock()) #infunc[5]
output.sort()
infunc.append(clock()) #infunc[6]
fm = open(m_file,'w')
fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line + '\n')
fm.close()
infunc.append(clock()) #infunc[7]
c_f = "xxA.txt"
o_f = "yyB.txt"
t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergeds.txt')
t2 = clock()
print 'merging split'
print 'total time of execution :',t2-t1
print ' launching :',infunc[1] - t1
print ' preparation :',infunc[1] - infunc[0]
print ' reading of 1st file :',infunc[2] - infunc[1]
print ' reading of 2nd file :',infunc[3] - infunc[2]
print ' output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print ' sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]
"merging killer"
#!/usr/bin/env python
from time import clock,strftime
from sets import Set
import re
infunc = []
def sorting_merge(o_file, c_file, m_file ):
infunc.append(clock()) #infunc[0]
patk = re.compile('([0123]\d)/([01]\d)/(\d{2}) ([012]\d:[0-6]\d:[0-6]\d)')
output = Set()
def rmHead(filename, a_set):
f_n = open(filename, 'r')
f_n.readline()
head = []
for line in f_n:
head.append(line) # line of the header
if line.strip('= \r\n')=='': break
for line in f_n:
a_set.add(line.rstrip())
f_n.close()
return head
infunc.append(clock()) #infunc[1]
head = rmHead(o_file, output)
infunc.append(clock()) #infunc[2]
head = rmHead(c_file, output)
infunc.append(clock()) #infunc[3]
if '' in output: output.remove('')
infunc.append(clock()) #infunc[4]
output = [ (patk.search(line).group(3,2,1,4),line)for line in output ]
infunc.append(clock()) #infunc[5]
output.sort()
infunc.append(clock()) #infunc[6]
fm = open(m_file,'w')
fm.write(strftime('On %d/%m/%y %H:%M:%S\n')+(''.join(head)))
for t,line in output:
fm.write(line + '\n')
fm.close()
infunc.append(clock()) #infunc[7]
c_f = "xxA.txt"
o_f = "yyB.txt"
t1 = clock()
sorting_merge(o_f, c_f, 'zz_mergedk.txt')
t2 = clock()
print 'merging killer'
print 'total time of execution :',t2-t1
print ' launching :',infunc[1] - t1
print ' preparation :',infunc[1] - infunc[0]
print ' reading of 1st file :',infunc[2] - infunc[1]
print ' reading of 2nd file :',infunc[3] - infunc[2]
print ' output.remove(\'\') :',infunc[4] - infunc[3]
print 'creation of list output :',infunc[5] - infunc[4]
print ' sorting of output :',infunc[6] - infunc[5]
print 'writing of merging file :',infunc[7] - infunc[6]
print 'closing of the function :',t2-infunc[7]
results
merging regex
total time of execution : 14.2816595405
launching : 0.00169211450059
preparation : 0.00168093989599
reading of 1st file : 0.163582242995
reading of 2nd file : 0.141301478261
output.remove('') : 2.37460347614e-05
creation of output : 13.4460212122
sorting of output : 0.216363532237
writing of merging file : 0.232923737514
closing of the function : 0.0797514767938
merging split
total time of execution : 13.7824474898
launching : 4.10666718815e-05
preparation : 2.70984161395e-05
reading of 1st file : 0.154349784679
reading of 2nd file : 0.136050810927
output.remove('') : 2.06730184981e-05
creation of output : 12.9691854691
sorting of output : 0.218704332534
writing of merging file : 0.225259076223
closing of the function : 0.0788362766776
merging killer
total time of execution : 2.14315311024
launching : 0.00206199391263
preparation : 0.00205026057781
reading of 1st file : 0.158711791582
reading of 2nd file : 0.138976601775
output.remove('') : 2.37460347614e-05
creation of output : 0.621466415424
sorting of output : 0.823161602941
writing of merging file : 0.227701565422
closing of the function : 0.171049393149
In the killer program, sorting output takes 4 times longer, but the time to create output as a list is divided by 21!
Globally, then, the execution time is reduced by at least 85%.
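The gain comes from what the sort key is: patk.search(line).group(3,2,1,4) rearranges each date into a (yy, mm, dd, hh:mm:ss) tuple of zero-padded strings, and such tuples already sort chronologically under plain string comparison, so no strptime()/mktime() call is needed per line. A tiny illustration:

>>> a = ('10', '03', '04', '01:38:02')   # 04/03/10 01:38:02, rearranged
>>> b = ('11', '06', '17', '12:58:10')   # 17/06/11 12:58:10, rearranged
>>> a < b   # 2010 sorts before 2011
True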
