Parse multiple log files for strings - python

I'm trying to parse a number of log files from a log directory, to search for any number of strings in a list along with a server name. I feel like I've tried a million different options, and I have it working fine with just one log file.. but when I try to go through all the log files in the directory I can't seem to get anywhere.
if args.f:
logs = args.f
else:
try:
logs = glob("/var/opt/cray/log/p0-current/*")
except IndexError:
print "Something is wrong. p0-current is not available."
sys.exit(1)
valid_errors = ["error", "nmi", "CATERR"]
logList = []
for log in logs:
logList.append(log)
#theLog = open("logList")
#logFile = log.readlines()
#logFile.close()
#printList = []
#for line in logFile:
# if (valid_errors in line):
# printList.append(line)
#
#for item in printList:
# print item
# with open("log", "r") as tmp_log:
# open_log = tmp_log.readlines()
# for line in open_log:
# for down_nodes in open_log:
# if valid_errors in open_log:
# print valid_errors
down_nodes is a pre-filled list further up the script containing a list of servers which are marked as down.
Commented out are some of the various attempts I've been working through.
logList = []
for log in logs:
logList.append(log)
I thought this may be the way forward to put each individual log file in a list, then loop through this list and use open() followed by readlines() but I'm missing some kind of logic here.. maybe I'm not thinking correctly.
I could really do with some pointers here please.
Thanks.

So your last for loop is redundant because logs is already a list of strings. With that information, we can iterate through logs and do something for each log.
# Scan every log file and act on each line that mentions a known error.
# `logs` is the list of paths and `valid_errors` the list of substrings,
# both built earlier in the script.
for log in logs:
    with open(log) as f:
        # Iterate the file object directly: it yields one line at a time,
        # whereas readlines() would load the whole log into memory first.
        for line in f:
            if any(error in line for error in valid_errors):
                pass  # do stuff
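The line if any(error in line for error in valid_errors): checks the line to see if any of the errors in valid_errors are in the line. The argument is a generator expression that yields the result of error in line (True or False) for each error in valid_errors, and any() returns True as soon as one of them is True.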
To answer your question involving down_nodes, I don't believe you should include this in the same any(). You should try something like
if any(error in line for error in valid_errors) and \
any(node in line for node in down_nodes):

Firstly you need to find all logs:
import os
import fnmatch
def find_files(pattern, top_level_dir):
    """Yield the path of every file under *top_level_dir* matching *pattern*.

    The tree is walked recursively with os.walk(). *pattern* is a shell-style
    wildcard (as understood by fnmatch) applied to the bare file name, not to
    the full path. Paths are produced lazily, joined onto the directory in
    which each file was found.
    """
    for path, _dirlist, filelist in os.walk(top_level_dir):
        for name in fnmatch.filter(filelist, pattern):
            yield os.path.join(path, name)
For example, to find all *.txt files in current dir:
txtfiles = find_files('*.txt', '.')
Then get file objects from the names:
def open_files(filenames):
    """Yield an open, read-only text file object for each name in *filenames*.

    Files are opened lazily (one per iteration step) and decoded as UTF-8.
    This generator does not close the files it yields; whoever consumes a
    file object is responsible for closing it.
    """
    for name in filenames:
        yield open(name, 'r', encoding='utf-8')
Finally individual lines from files:
def lines_from_files(files):
    """Yield every line of every file in *files*, in order.

    *files* is an iterable of line iterables (typically open file objects).
    Each item is closed as soon as it has been exhausted, or when this
    generator is abandoned part-way through, so that handles do not leak.
    Items without a close() method (e.g. plain lists of strings) are
    simply iterated.
    """
    for f in files:
        try:
            for line in f:
                yield line
        finally:
            # Release the handle even if the consumer stops early.
            close = getattr(f, "close", None)
            if callable(close):
                close()
Since you want to find some errors the check could look like this:
import re
def find_errors(lines):
    """Print every line in *lines* that contains a known error marker.

    A line matches when it contains 'error', 'nmi' or 'CATERR' anywhere
    (case-sensitive). Lines read from a file still carry their trailing
    newline, so it is stripped before printing to avoid blank lines
    between the reported entries.
    """
    pattern = re.compile(r'error|nmi|CATERR')
    for line in lines:
        if pattern.search(line):
            print(line.rstrip('\n'))
You can now process a stream of lines generated from a given directory:
txt_file_names = find_files('*.txt', '.')
txt_files = open_files(txt_file_names)
txt_lines = lines_from_files(txt_files)
find_errors(txt_lines)
The idea of processing logs as a stream of data comes from a talk by David Beazley.

Related

How to find first and last characters in a file using python?

I am stuck on this revision exercise which asks to copy an input file to an output file and return the first and last letters.
def copy_file(filename):
input_file = open(filename, "r")
content = input_file.read()
content[0]
content[1]
return content[0] + content[-1]
input_file.close()
Why do I get an error message when I try to get the first and last letters? And how would I copy the file to the output file?
Here is the test:
input_f = "FreeAdvice.txt"
first_last_chars = copy_file(input_f)
print(first_last_chars)
print_content('cure737.txt')
Error Message:
FileNotFoundError: [Errno 2] No such file or directory: 'hjac737(my username).txt'
All the code after a return statement is never executed (a proper code editor would highlight this for you, so I recommend you use one), so the file was never closed. A good practice is to use a context manager for that: it will automatically call close for you, even in case of an exception, when you exit the scope (indentation level).
The code you provided also fails to write the file content, which may be causing the error you reported.
I explicitly used the "rt" (and "wt") modes for the files (although they are the defaults), because we want the first and last characters of the file: text mode decodes the content, so this works for any Unicode character, not just ASCII.
def copy_file(filename, output_file_name=None):
    """Copy a text file and return its first and last characters.

    Parameters:
        filename: path of the text file to read.
        output_file_name: path of the copy to write. When omitted, the
            copy is written to "LENORMJU.txt" in the current directory.

    Returns:
        The first character followed by the last character of the content
        (the same character twice for a one-character file), or an empty
        string when the file is empty.
    """
    with open(filename, "rt") as input_file:
        content = input_file.read()
    print(input_file.closed)  # True

    if output_file_name is None:
        my_username = "LENORMJU"
        output_file_name = my_username + ".txt"
    with open(output_file_name, "wt") as output_file:
        output_file.write(content)
    print(output_file.closed)  # True

    # last: return the result
    # Slices (not indexes) so that an empty file yields "" instead of
    # raising IndexError.
    return content[:1] + content[-1:]
print(copy_file("so67730842.py"))
When I run this script (on itself), the file is copied and I get the output d) which is correct.

python replacement for xml

I have file, FF_tuningConfig_AMPKi.xml, contains of records such as:
<KiConfig active="%{active}" id="AMP_RET_W_LIN_SUSPICIOUS_MULTIPLE_LOGIN_IN_SHORT_PERIOD$KiConfig"/>
<KiConfig active="%{active}" id="AMP_RET_W_LIN_UNUSUAL_SESSION_HOUR_OF_DAY$KiConfig"/>
I have the following code:
def replace_content(path,se,search,String_Replace):
for root, dirs, files in os.walk(path):
for filename in files:
if((se in filename)):
file=open(os.path.join(root, filename),'r')
lines = file.readlines()
file=open(os.path.join(root, filename),'w')
for line in lines:
if search in line:
#print "found="+line
words=line.split('=')
# print words
# print "line=" + words[0] +"="+ "8\n"
line=line.replace(line,String_Replace)
#print "after="+line
file.write(line)
file.close()
print (os.path.join(root,filename) + " was replaced")
replace_content(Path,'FF_tuningConfig_AMPKi.xml','<KiConfig active="%{active}"','<KiConfig active="true"')
I am getting the below:
active="true" <Thresholds>
Instead of:
<KiConfig active="true" id="AMP_RET_W_LIN_UNUSUAL_SESSION_HOUR_OF_DAY$KiConfig"/>
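Your problem is with line=line.replace(line,String_Replace): the first argument of str.replace() is the substring to be replaced, so passing line itself replaces the entire line with String_Replace. Pass search instead (take a look at the documentation for str.replace()):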
line = line.replace(search,String_Replace)
To test your code, you could have written a separate script with only the part that seemed to be failing.
# test input
s = '''<KiConfig active="%{active}" id="AMP_RET_W_LIN_SUSPICIOUS_MULTIPLE_LOGIN_IN_SHORT_PERIOD$KiConfig"/>
<KiConfig active="%{active}" id="AMP_RET_W_LIN_UNUSUAL_SESSION_HOUR_OF_DAY$KiConfig"/>'''
lines = s.split('\n')
# parameters
search, String_Replace = '<KiConfig active="%{active}"','<KiConfig active="true"'
# Then the part of your code that seems to be failing
for line in lines:
if search in line:
line = line.replace(line, String_Replace)
print(line)
That lets you focus on the problem and makes it easy and fast to modify then test your code. Once you have that functionality working, copy and paste it into your working code. If that part of your code actually works then you have eliminated it as a source for errors and you can test other parts.
As an aside, no need to test if your search string is in the line before attempting to replace. If the search string isn't in the line, str.replace() will return the line without modification.

Splitting FASTA sequence using Python

The following error message appears when I try to run the following script:
Background: I am trying to split up a large FASTA file (~45Mb) into smaller files based on gene id. I would like to chop it every time the ">" appears. The following .py script allows me to do so. However, every now and then I get the following error. Any feedback would be greatly appreciated.
Script:
import os
os.chdir("/vmb/Flavia_All/Python_Commands")
outfile = os.chdir("/vmb/Flavia_All/Python_Commands")
import sys
infile = open(sys.argv[1])
outfile = []
for line in infile:
if line.startswith(">"):
if (outfile != []): outfile.close()
genename = line.strip().split('|')[1]
filename = genename+".fasta"
outfile = open(filename,'w')
outfile.write(line)
else:
outfile.write(line)
outfile.close()
Error Message when script is run:
Traceback (most recent call last):
File "splitting_fasta.py", line 14, in <module>
outfile = open(filename,'w')
IOError: [Errno 2] No such file or directory: 'AY378100.1_cds_AAR07818.1_173 [gene=pbrB/pbrC] [protein=PbrB/PbrC] [protein_id=AAR07818.1] [location=complement(152303..153451)].fasta'
*NOTE: AY378100.1_cds_AAR07818.1 is one of many genes in this FASTA sequence. This is not the only gene i have had the same message appear with. I would like to stop deleting each gene that comes up with this message.
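It seems that some of the fasta "names" (the first description line, starting with >) contain too much. In particular, they contain characters that are not allowed in file names: here the / in [gene=pbrB/pbrC] is treated as a directory separator, so open() looks for a directory that does not exist.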
If names like the GenBank ID - AY378100 are sufficiently unambiguous then:
genename = line.strip().split('|')[1].split('.')[0]
may be OK. If you have many fasta sequences with the same ID you probably will be OK with:
genename = line.strip().split('|')[1].split('[')[0].strip()

Python text parsing and saving as html

I've been playing around with Python trying to write a script to scan a directory for specific files, finding certain keywords and saving the lines where these keywords appear into a new file. I came up with this;
import sys, os, glob
for filename in glob.glob("./*.LOG"):
with open(filename) as logFile:
name = os.path.splitext(logFile.name)[0]
newLOG = open(name + '_ERROR!'+'.LOG', "w")
allLines = logFile.readlines()
logFile.close()
printList = []
for line in allLines:
if ('ERROR' in line) or ('error' in line):
printList.append(line)
for item in printList:
# print item
newLOG.write(item)
This is all good but I thought I'd try instead saving this new file as html wrapping it all in the rights tags(html,head,body...) so that maybe I could change the font colour of the keywords. So far it looks like this;
import sys, os, glob
for filename in glob.glob("./*.LOG"):
with open (filename) as logFile:
name = os.path.splitext(logFile.name)[0]
newLOG = open(name + '_ERROR!'+'.html', "w")
newLOG.write('<html>')
newLOG.write('<head>')
newLOG.write('<body><p>')
allLines = logFile.readlines()
logFile.close()
printList = []
for line in allLines:
if ('ERROR' in line) or ('error' in line):
printList.append(line)
for item in printList:
# print item
newLOG.write('</html>')
newLOG.write('</head>')
newLOG.write('</body><p>')
newLOG.write(item)
Now the problem is I'm new to this and I'm still trying to figure out how to work with indentations and loops.. Because my html tags are being appended from within the loop, every line has the <html>, <head> & <body><p> tag around them and it just looks wrong. I understand the problem and have tried rewriting things so that the tags are applied outside the loop but I've not had much success.
Could someone show me a better way of getting the file name of the current file, creating a new file+appending it as I think this is why I'm getting the file handling errors when trying to change how it all works.
Thanks
It's a matter of indenting the lines to the right level. The HTML footer must be printed at the indentation level of the header lines, not indented within the loop. Try this:
import glob
import html
import os
import sys

# For every *.LOG file in the current directory, write a companion
# "<name>_ERROR!.html" page listing the lines that contain ERROR/error.
for filename in glob.glob("./*.LOG"):
    name = os.path.splitext(filename)[0]
    with open(filename, 'r') as logFile, open('%s_ERROR!.html' % name, 'w') as outfile:
        # Header: written once per output file, before any log line.
        outfile.write("<html>\n<head>\n</head>\n<body><p>")
        allLines = logFile.readlines()
        printList = []
        for line in allLines:
            if ('ERROR' in line) or ('error' in line):
                printList.append(line)
        for item in printList:
            # Note: HTML-escape value of item. html.escape replaces
            # cgi.escape (removed in Python 3.8); quote=False keeps the
            # same output, escaping only &, < and >.
            outfile.write(html.escape(item, quote=False) + '<br>')
        # Footer: same indentation level as the header, outside the loops.
        outfile.write("</p></body>\n</html>")
Note that you don't need to use printList - you could just emit the HTML code as you go through the log.
Consider breaking this into smaller functions for reusability and readability.

problem with seek() in file handling

I am using seek in a file- the file has a bunch of filenames and some logs of a process done on the file- some of these logs have errors. I am going line by line, if i get an error, I want to log everything in between two filenames.
When I use seek, I think that instead of moving it to the line i want it to, it moves it to the character # instead. For example
f=open("fileblah",'r')
while f:
line=f.readline()
counter=counter+1
f.seek(tail_position) # i want the next loop to start from after the error happened.
if line.startswith("D:")
header_position=counter
error_flag=0 #unset error flag
if line.startswith("error")
error_flag=1 #set error_flag
while(not(line.startswith("D:"): #go until next file beginning
line=f.readline()
counter=counter+1
tail_position=counter #have come to the next filename
I can see this is highly inefficient, but it doesn't work at all, because f.seek(tail_position) is moving the file pointer to the character # instead of the line #
Use .tell() to store your start-of-line position, then you can .seek() back to it.
Edit: I think this is what you want:
def errorsInLog(fname, newfileStr='D:', iserrorStr='error'):
    """Yield each section of the log *fname* that contains an error line.

    A section starts at a line beginning with *newfileStr* (a file name
    header) and runs up to, but not including, the next such line or the
    end of the file. A section is yielded, as one string, only if at least
    one of its lines begins with *iserrorStr*. Error lines that appear
    before the first header are reported together with everything from the
    start of the file.

    NOTE: the section length is computed by subtracting tell() offsets and
    passing the result to read(), which counts characters in text mode.
    This assumes the offsets advance by one per character (e.g. ASCII text
    with plain newline line endings).
    """
    with open(fname) as inf:
        # prev: offset where the current section starts.
        # pos:  offset where the line currently held in `line` starts.
        prev = pos = inf.tell()
        line = inf.readline()
        error = False

        while line:
            if line.startswith(newfileStr):
                if error:
                    # Rewind and re-read the whole finished section. This
                    # leaves the file positioned at `pos`, so the header
                    # line is read once more below, which is harmless.
                    inf.seek(prev)
                    yield inf.read(pos - prev)
                prev = pos
                error = False
            elif line.startswith(iserrorStr):
                error = True

            pos = inf.tell()
            line = inf.readline()

        # The last section has no following header to trigger the yield.
        if error:
            inf.seek(prev)
            yield inf.read()
def main():
    """Print the error sections of the log 'fileblah', two newlines apart."""
    print('\n\n'.join(errorsInLog('fileblah')))
For each filename followed by an error it returns a string encompassing the filename and all following lines, up to but not including the next filename or end-of-file.
seek() is used more often in random access file reading. If the file being read is already text and it can be read line by line then you need only read the line and then operate on the line using string operations. There is no need to move the file read position.
Your code need only look like this:
for line in f:
do_stuff_with line
Like stdio's fseek(), seek(offset[, whence]) sets the current position of the file to offset. whence defaults to 0 (offset is measured from the start of the file), so you can do something like this:
while(not(line.startwith("D:"))):
fseek(tail_position,'\n')
tail_position ++

Categories

Resources