How to parse Log file to object list - python

I'm working with data tipe Log (ROS).
Multiple objects are saved in Log file like this:
header:
seq: 2
stamp:
secs: 1596526199
nsecs: 140017032
frame_id: ''
level: 2
name: "/replicator_node"
msg: "Replicator node dumping to /tmp/replicator_dumps"
file: "replicator_node.py"
function: "__init__"
line: 218
topics: [/move_mongodb_entries/status, /move_mongodb_entries/goal, /move_mongodb_entries/result,
/move_mongodb_entries/cancel, /rosout, /move_mongodb_entries/feedback]
header:
seq: 2
stamp:
secs: 1596526198
nsecs: 848793029
frame_id: ''
level: 2
name: "/mongo_server"
msg: "2020-08-04T09:29:58.848+0200 [initandlisten] connection accepted from 127.0.0.1:58672\
\ #1 (1 connection now open)"
file: "mongodb_server.py"
function: "_mongo_loop"
line: 139
topics: [/rosout]
As you can see not everything is in same line as it's name.
I want to pars it to have object list - so I could access it like that:
object[1].msg would give me:
"2020-08-04T09:29:58.848+0200 [initandlisten] connection accepted from 127.0.0.1:58672 #1 (1 connection now open)"
Also, sometimes file name is something like: \home\nfoo\foo.py which results in log file as:
file: "\home
foo\foo.py"

It's an interesting exercise... Assuming that the structure is really consistent for all log entries, you can try something like this - pretty convoluted, but it works for the example in the question:
ros = """[your log above]"""
def manage_lists_2(log_ind, list_1, list_2, mystr):
if log_ind == 0:
list_1.append(mystr.split(':')[0].strip())
list_2[-log_ind].append(mystr.split(':')[1].strip())
m_keys2 = []
m_key_vals2 = [[],[]]
header_keys2 = []
header_key_vals2 = [[],[]]
stamp_keys2 = []
stamp_key_vals2 = [[],[]]
for log in logs:
for l in log.splitlines():
if l[0]!=" ":
items = [m_keys2, m_key_vals2]
elif l[0:3] != " ":
items = [header_keys2, header_key_vals2]
else:
items = [stamp_keys2, stamp_key_vals2]
manage_lists_2(logs.index(log), items[0], items[1], l)
for val in m_key_vals2:
for a, b, in zip(m_keys2,val):
print(a, ": ",b)
if a == "header":
for header_key in header_keys2:
print('\t',header_key,':',header_key_vals2[m_keys2.index(a)][header_keys2.index(header_key)])
if header_key == "stamp":
for stamp_key in stamp_keys2:
print('\t\t',stamp_key,':',stamp_key_vals2[m_keys2.index(a)][stamp_keys2.index(stamp_key)])
print('---')
Output:
header :
seq : 2
stamp :
secs : 1596526199
nsecs : 140017032
frame_id : 'one id'
level : 2
name : "/replicator_node"
msg : "Replicator node dumping to /tmp/replicator_dumps"
file : "replicator_node.py"
function : "__init__"
line : 218
topics : [/move_mongodb_entries/status, /move_mongodb_entries/goal, /move_mongodb_entries/result, /move_mongodb_entries/cancel, /rosout, /move_mongodb_entries/feedback]
---
header :
seq : 2
stamp :
secs : 1596526199
nsecs : 140017032
frame_id : 'one id'
level : 3
name : "/mongo_server"
msg : "2020-08-04T09
file : "mongodb_server.py"
function : "_mongo_loop"
line : 139
topics : [/rosout]
Having gone through that, I would recommend that - if you are going to do this on a regular basis - you find a way to store the data in xml format; it's a natural fit for it.

Related

iteration over 2d list in python and printing headings for each element

I am trying to work out how to iterate over a list and print out each item with a print statement describing what element is. my project is to create a user management system and print out something similar to the image I have attached.
The output I am trying to produce
The output I am getting
My code:
records = 0
userFirst = ["John"]
userLast = ["Doe"]
autoUsername = ["Johndoe91"]
autoPassword = ["123456789"]
hiddenPassword = ["*****789"]
userRole = ["User"]
userDept = ["Administration"]
users = []
confidentialUserDetails = []
users.append(userFirst + userLast + userRole + userDept + autoUsername + autoPassword)
confidentialUserDetails.append(users)
for row in range(len(confidentialUserDetails)):
records += 1
print("-" * 25)
print("Record: ", records)
for col in range(len(confidentialUserDetails[row])):
print(confidentialUserDetails[row][col])
Any help would be greatly appreciated. :)
Your data structures are unusual. I'm assuming that those lists are going to be provided to your code somehow and will, in practice, have multiple user details appended to them so that they are all the same length.
Anyhow, you can achieve the output you're looking for with some readable f-strings like this:
from functools import reduce
userFirst = ["John"]
userLast = ["Doe"]
autoUsername = ["Johndoe91"]
autoPassword = ["123456789"]
hiddenPassword = ["*****789"]
userRole = ["User"]
userDept = ["Administration"]
for row in range(len(userFirst)):
s = (f"""\
Name : {userFirst[row]} {userLast[row]}
Role : {userRole[row]}
Department : {userDept[row]}
Username : {autoUsername[row]}
Password : {hiddenPassword[row]}""")
maxlen = reduce(lambda x,y: max(x, len(y)), s.split("\n"), 0)
print(f"{s}\n{'-'*maxlen}\n")
Output:
Name : John Doe
Role : User
Department : Administration
Username : Johndoe91
Password : *****789
------------------------------
I created a dictionary called user instead of your list and after that I appended it to the second list and finally I printed the key and the value of the dictionary.
Also to get the full name I joined userFirst and userLast as string.
Code:
records = 0
userFirst = ["John"]
userLast = ["Doe"]
autoUsername = ["Johndoe91"]
autoPassword = ["123456789"]
hiddenPassword = ["*****789"]
userRole = ["User"]
userDept = ["Administration"]
confidentialUserDetails = [] # 2d list for asterisked passwords
users={'Name' : [' '.join(userFirst + userLast)] ,'Role' : userRole , 'Departement' : userDept ,'Username' : autoUsername ,'Password' : hiddenPassword }
confidentialUserDetails.append(users)
for user in confidentialUserDetails:
records += 1
print("-" * 25)
print("Record: ", records)
for ele,v in user.items():
print(ele,':',v[0])
Output:
-------------------------
Record: 1
Name : John Doe
Role : User
Departement : Administration
Username : Johndoe91
Password : *****789
Using a dictionary or f strings like the two other answers suggested is probably the best. But if you just want to use your current code to print your desired output, you can simply grab each item by its index number in your print statement.
Change the line:
print(confidentialUserDetails[row][col])
To something like this:
print("Name : ", confidentialUserDetails[row][col][0], confidentialUserDetails[row][col][1])
print("Role: : ", confidentialUserDetails[row][col][2])
Output:
-------------------------
Record: 1
Name : John Doe
Role: : User

create dict keys depending on the number of times the same value occurs

I have a dict as below, if the same value is found more tahn once then the dict key must be created with incremental numbering.
TT = {
"TYPE_1" : ['ERROR'],
"TYPE_2": ['FATAL'],
"TYPE_3" : ["TIME_OUT"],
"TYPE_4" : ['SYNTAX'],
"TYPE_5" : ['COMPILE'],
}
m1 = "ERROR the input is not proper"
m2 = "This should have not occured FATAL"
m3 = "Sorry TIME_OUT"
m4 = "SYNTAX not proper"
m5 = "u r late its TIME_OUT"
The value "TIME_OUT" occur twice in the search.
count = 0
for key in TT.keys():
print(key)
Key_1 = key
while key_1 in TT:
count = count+1
key_1 = key + "_{}".format(count)
The above code gives error Key_1 not defined.
Expected OUTPUT:
if the same value is occuring more than once then the dict key should be created with incremental numbering "TYPE_3_1" : ["TIME_OUT"],
TT = {
"TYPE_1" : ['ERROR'],
"TYPE_2": ['FATAL'],
"TYPE_3" : ["TIME_OUT"],
"TYPE_3_1" : ["TIME_OUT"],
"TYPE_4" : ['SYNTAX'],
"TYPE_5" : ['COMPILE'],
}
Please suggest on this.
There can be a much efficient way of solving this if you can rethink about some of the data structure but if that is not an option you may be able to try this.
inputs = ["ERROR the input is not proper",
"This should have not occured FATAL",
"Sorry TIME_OUT",
"SYNTAX not proper",
"u r late its TIME_OUT"]
basic_types = {
"TYPE_1" : ['ERROR'],
"TYPE_2": ['FATAL'],
"TYPE_3" : ["TIME_OUT"],
"TYPE_4" : ['SYNTAX'],
"TYPE_5" : ['COMPILE'],
}
type_counts = {}
results = {}
for sentence in inputs:
for basic_type in basic_types:
if basic_types.get(basic_type)[0] in sentence:
type_counts[basic_type] = type_counts.get(basic_type, 0) + 1
if type_counts[basic_type] == 1:
results[basic_type] = [basic_types.get(basic_type)[0]]
else:
results[basic_type+"_{}".format(type_counts[basic_type] - 1)] = [basic_types.get(basic_type)[0]]
print(results)

Empty json object when read text file

I'm testing a python script with text data. Able to run the script and return valid json file if the text include in the script but I got empty json object when run the script and with separate text file.
The output only empty json file
{
"ospf": []
}
The code below return empty json object when run it with read text file
import json
result = {}
l = []
with open('data.txt') as myf:
for i in myf:
if i:
p = [parameter for parameter in i.split("*")]
for line, x in enumerate(p[0].split("\n")):
if x and "Ls id" in x:
ls_id, ip = x.split(": ")
ls_id = ls_id.strip()
ip = ip.strip()
for y in p[1:]:
if y and "P-2-P" in y:
temp = {ls_id:ip}
for items in y.split("\n"):
try:
key, value = items.split(": ")
key = key.strip()
value = value.strip()
temp[key] = value
except ValueError:
pass
l.append(temp)
result["ospf"] = l
print (json.dumps(result,indent=2))
with open('data.json', 'w') as json_file:
json.dump(result, json_file)
When executed the code below ok with the text data include as data..no problem
import json
data = '''
Type : Router
Ls id : 1.1.1.2
Adv rtr : 1.1.1.2
Ls age : 201
Len : 84
Link count: 5
* Link ID: 1.1.1.2
Data : 255.255.255.255
Link Type: StubNet
Metric : 1
Priority : Medium
* Link ID: 1.1.1.4
Data : 192.168.100.34
Link Type: P-2-P
Metric : 1
* Link ID: 192.168.100.33
Data : 255.255.255.255
Link Type: StubNet
Metric : 1
Priority : Medium
* Link ID: 1.1.1.1
Data : 192.168.100.53
Link Type: P-2-P
Metric : 1
* Link ID: 192.168.100.54
Data : 255.255.255.255
Link Type: StubNet
Metric : 1
Priority : Medium
Type : Router
Ls id : 1.1.1.1
Adv rtr : 1.1.1.1
Ls age : 1699
Len : 96
Options : ASBR E
seq# : 80008d72
chksum : 0x16fc
Link count: 6
* Link ID: 1.1.1.1
Data : 255.255.255.255
Link Type: StubNet
Metric : 1
Priority : Medium
* Link ID: 1.1.1.1
Data : 255.255.255.255
Link Type: StubNet
Metric : 12
Priority : Medium
* Link ID: 1.1.1.3
Data : 192.168.100.26
Link Type: P-2-P
Metric : 10
* Link ID: 192.168.100.25
Data : 255.255.255.255
Link Type: StubNet
Metric : 10
Priority : Medium
* Link ID: 1.1.1.2
Data : 192.168.100.54
Link Type: P-2-P
Metric : 10
* Link ID: 192.168.100.53
Data : 255.255.255.255
Link Type: StubNet
Metric : 10
Priority : Medium'''
import json
result = {}
l = []
for i in data.split("\n\n"):
if i:
p = [parameter for parameter in i.split("*")]
for line, x in enumerate(p[0].split("\n")):
if x and "Ls id" in x:
ls_id, ip = x.split(": ")
ls_id = ls_id.strip()
ip = ip.strip()
for y in p[1:]:
if y and "P-2-P" in y:
temp = {ls_id:ip}
for items in y.split("\n"):
try:
key, value = items.split(": ")
key = key.strip()
value = value.strip()
temp[key] = value
except ValueError:
pass
l.append(temp)
result["ospf"] = l
print (json.dumps(result,indent=2))
with open('data.json', 'w') as json_file:
json.dump(result, json_file)
I'm not sure where i make wrong. Please advise me further. Thank you.
A simple workaround would be to concat the file to one large string. then your code works as expected. This is definetly no clean answer but you could let the rest of your code unchanged.
import json
result = {}
l = []
with open('data.txt') as myf:
a = ''.join(myf)
for i in a.split("\n\n"):
if i:
p = [parameter for parameter in i.split("*")]
for line, x in enumerate(p[0].split("\n")):
if x and "Ls id" in x:
ls_id, ip = x.split(": ")
ls_id = ls_id.strip()
ip = ip.strip()
for y in p[1:]:
if y and "P-2-P" in y:
temp = {ls_id:ip}
for items in y.split("\n"):
try:
key, value = items.split(": ")
key = key.strip()
value = value.strip()
temp[key] = value
except ValueError:
pass
l.append(temp)
result["ospf"] = l
print (json.dumps(result,indent=2))
with open('data.json', 'w') as json_file:
json.dump(result, json_file)

Python 'key error' while building dictionary dynamically (On the fly)

I get the error onthis line of code -
result_dict['strat'][k]['name'] = current_comps[0].strip()
The error is : Keyerror: 'strat'
I have an input line
PERSON1 ## CAR1 # ENTRY : 0 | EXIT : 0 ## CAR2 # M1 : YES : 10/01/17 02:00 | M2 : NO : 10/02/16 03:00 | M3 : NO : 05/07/17 11:00 | M4 : YES : 01/01/16 03:00 ## TRUCK # M3 : NO : 03/01/17 03:45 | M23 : NO : 01/01/14 07:00 | M27 : YES : 02/006/18 23:00
I 'm looking to parse this input to generate the output detailed below. As part of this, I'm trying to build a dictionary inserting both keys & values dynamically. I'm having a lot of problems doing this.
Could I please request help on this?
Here is what I've tried so far -
# File read
f = open('input_data', 'r')
file_cont = f.read().splitlines()
f.close()
#json template
# Initialize dictionary
result_arr = []
result_dict = {}
k = 0
for item in file_cont:
strat = item.split('##')
result_dict['Person'] = strat[0].strip()
j = 1
while j < len(strat):
# Split various components of the main line
current_comps = strat[j].split('#')
# Name of strat being parsed
result_dict['strat'][k]['name'] = current_comps[0].strip()
# tfs across the various time frames
tfs = current_comps[1].split('|')
# First travel mode
if current_comps[0].strip() == 'CAR1':
temp_low_arr = tfs[0].split(':')
temp_high_arr = tfs[1].split(':')
result_dict['strat'][k]['Entry'] = temp_low_arr[1].strip()
result_dict['strat'][k]['Exit'] = temp_high_arr[1].strip()
# Second travel mode
elif current_comps[0].strip() == 'CAR2':
z = 0
while z < len(tfs):
# Split components of the sign
sign_comp_car_2 = tfs[z].split(':')
result_dict['strat'][k]['tf'][z]['path'] = sign_comp_ma_cross[0].strip()
result_dict['strat'][k]['tf'][z]['sign'] = sign_comp_ma_cross[1].strip()
result_dict['strat'][k]['tf'][z]['sign_time'] = sign_comp_ma_cross[2].strip()
z += 1
# Third travel mode
elif current_comps[0].strip() == 'CAR3':
b = 0
while b < len(tfs):
# Split components of the sign
sign_car_3 = tfs[z].split(':')
result_dict['strat'][k]['tf'][b]['path'] = sign_all_term[0].strip()
result_dict['strat'][k]['tf'][b]['sign'] = sign_all_term[1].strip()
result_dict['strat'][k]['tf'][b]['sign_time'] = sign_all_term[2].strip()
b += 1
j += 1
k += 1
Expected output
[{
"Person":"",
"Transport":[
{
"Name":"CAR1",
"Entry":"0",
"Exit":"0"
},
{
"name":"CAR2:",
"tf":[
{
"path":"M1",
"sign":"YES",
"sign_time":"10/01/17 02:00"
},
{
"path":"M2",
"sign":"NO",
"sign_time":"10/02/16 03:00"
},
{
"path":"M3",
"sign":"NO",
"sign_time":"05/07/17 11:00"
},
{
"path":"M4",
"sign":"YES",
"sign_time":"01/01/16 03:00"
}
]
},
{
"name":"CAR3",
"tf":[
{
"path":"M3",
"sign":"NO",
"sign_time":"03/01/17 03:45"
},
{
"path":"M23",
"sign":"NO",
"sign_time":"01/01/14 07:00"
},
{
"path":"M27",
"sign":"Yes",
"sign_time":"02/006/18 23:00"
}
]
}
]
}]
The issue is when you try to assign the ['name'] field in result_dict['strat'][k] when result_dict['strat'][k] hasn't been initialized yet. Before you run your for-loop, the dictionary has no key called strat.
Now you could have done something like result_dict['strat'] = dict() (assigning an object to that key in the dict), but when you further subscript it using result_dict['strat'][k], it will try to resolve that first, by accessing result_dict['strat'], expecting either a subscriptable collection or a dictionary in return. However, since that key doesn't exist yet, it throws you the error.
What you could do instead is initialize a default dictionary:
from collections import defaultdict
...
resultdict = defaultdict(dict)
...
Otherwise, in your existing code, you could initialize a dict within result_dict before entering the loop.

Doing operations on a large data set

I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is the PSL records are large (10-30 Mb text documents). I wrote a program that works on short records and on the long records given enough time but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds. Mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys
class PSLreader :
'''
Class to provide reading of a file containing psl alignments
formatted sequences:
object instantiation:
myPSLreader = PSLreader(<file name>):
object attributes:
fname: the initial file name
methods:
readPSL() : reads psl file, yielding those alignments that are within the first or last
1000 nt
readPSLpairs() : yields psl pairs that support a circular hypothesis
Author: David Bernick
Date: May 12, 2013
'''
def __init__ (self, fname=''):
'''contructor: saves attribute fname '''
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
'''
using filename given in init, returns each filtered psl records
that contain alignments that are within the terminal 1000nt of
the target. Incomplete psl records are discarded.
If filename was not provided, stdin is used.
This method selects for alignments that could may be part of a
circle.
Illumina pairs aligned to the top strand would have read1(+) and read2(-).
For the bottoms trand, read1(-) and read2(+).
For potential circularity,
these are the conditions that can support circularity:
read1(+) near the 3' terminus
read1(-) near the 5' terminus
read2(-) near the 5' terminus
read2(+) near the 3' terminus
so...
any read(+) near the 3', or
any read(-) near the 5'
'''
nearEnd = 1000 # this constant determines "near the end"
with self.doOpen() as fileH:
for line in fileH:
pslList = line.split()
if len(pslList) < 17:
continue
tSize = int(pslList[14])
tStart = int(pslList[15])
strand = str(pslList[8])
if strand.startswith('+') and (tSize - tStart > nearEnd):
continue
elif strand.startswith('-') and (tStart > nearEnd):
continue
yield line
def readPSLpairs (self):
read1 = []
read2 = []
for psl in self.readPSL():
parsed_psl = psl.split()
strand = parsed_psl[9][-1]
if strand == '1':
read1.append(parsed_psl)
elif strand == '2':
read2.append(parsed_psl)
output = {}
for psl1 in read1:
name1 = psl1[9][:-1]
contig1 = psl1[13]
for psl2 in read2:
name2 = psl2[9][:-1]
contig2 = psl2[13]
if name1 == name2 and contig1 == contig2:
try:
output[contig1] += 1
break
except:
output[contig1] = 1
break
print(output)
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
for leftItem in a[1]:
for rightItem in a[2]:
if leftItem[1] is rightItem[1]:
print (a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]
for name, instrument, num in thisStream:
if name != thisGroup:
doSomethingPairwise(thisGroupList)
thisGroup = name
thisGroupList = [ [], [], [] ]
thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
I hope help you, since, the question needs a best input example file
#is better create PSLRecord class
class PSLRecord:
def __init__(self, line):
pslList = line.split()
properties = ("matches", "misMatches", "repMatches", "nCount",
"qNumInsert", "qBaseInsert", "tNumInsert",
"tBaseInsert", "strand", "qName", "qSize", "qStart",
"qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
"blockSizes", "qStarts", "tStarts")
self.__dict__.update(dict(zip(properties, pslList)))
class PSLreader :
def __init__ (self, fname=''):
self.fname = fname
def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
with self.doOpen() as fileH:
for line in fileH:
pslrc = PSLRecord(line)
yield pslrc
#return a dictionary with all psl records group by qName and tName
def readPSLpairs (self):
dictpsl = {}
for pslrc in self.readPSL():
#OP requirement, remove '1' or '2' char, in pslrc.qName[:-1]
key = (pslrc.qName[:-1], pslrc.tName)
if not key in dictpsl:
dictpsl[key] = []
dictpsl[key].append(pslrc)
return dictpsl
#Function filter .... is better out and self-contained
def f_filter(pslrec, nearEnd = 1000):
if (pslrec.strand.startswith('+') and
(int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
return False
if (pslrec.strand.startswith('-') and
(int(pslrec.tStart) > nearEnd)):
return False
return True
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
#read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()
from itertools import product
#product from itertools
#(1) x (2,3) = (1,2),(1,3)
output = {}
for key, v in dictpsl.items():
name, contig = key
#i get filters aligns in principal strand
strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '1']
#i get filters aligns in secondary strand
strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '2']
for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
#This For has fewer comparisons, since I was grouped before
if not contig in output:
output[contig] = 1
output[contig] += 1
Note: 10-30 Mb isn't large file, if you ask me

Categories

Resources