This program scans through a log file and finds faults and the timestamps for those faults. The problem I am having is finding a way to modify my program so that it can iterate over multiple files given on the command line via a wildcard. In its current state, the code can accept a single file and build the dictionary with my desired info successfully. I have been struggling to find a way to do this with multiple files at once. The goal is to be able to enter the filename with a wildcard on the command line so that all matching files are parsed; for example, after the executable I would enter -f filename.*txt. However, I cannot find a way to run multiple files through my fault finder. I have succeeded in collecting multiple files and proved it by printing out the list of files found, but when it comes to using multiple files to build the dictionary, I am stumped. I would like the program to produce the same kind of result it does when parsing a single file.
import sys
import argparse

_TIME_STAMP_LENGTH = 16
_FAULT_STRING_HEADER_LENGTH = 15

class FaultList():
    fault_dict = {}
    fault_dict_counter = {}

    def __init__(self, file):
        self.file = file
        self.find_faults()
        print self.fault_dict

    def find_faults(self):
        with open(self.file) as f:
            for line in f.readlines():
                fault_index = line.find("Fault Cache id")
                if(fault_index != -1):
                    time_stamp = line[:_TIME_STAMP_LENGTH]
                    fault_data = line[fault_index+_FAULT_STRING_HEADER_LENGTH:-11][:-1] #need the [:-1] to remove new line from string
                    self.handle_new_fault_found(fault_data, time_stamp)

    def handle_new_fault_found(self, fault, time_stamp):
        try:
            self.fault_dict[fault] = [fault]
            self.fault_dict[fault].append(int(time_stamp))
            self.fault_dict_counter[0] += 1
        except KeyError:
            self.fault_dict_counter[fault] = [1, [time_stamp]]

def main(file_names):
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", dest="file_names",
                        help="The binary file to be writen to flash")
    args = parser.parse_args()
    fault_finder = FaultList(args.file_names)
    args = parser.parse_args()

if __name__ == '__main__':
    main(sys.argv[1:])
Here is the output of the dictionary when parsing a single file:
{'fault_01_17_00 Type:Warning': ['fault_01_17_00 Type:Warning', 37993146319], 'fault_0E_00_00 Type:Warning': ['fault_0E_00_00 Type:Warning', 38304267561], 'fault_05_01_00 Typ': ['fault_05_01_00 Typ', 38500887160]}
You can use the os module for listing files.
import os

# find all files in a directory
path = 'path of the files'
files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]
# filter them, keeping only files that end with 'txt'
txt_files = [file for file in files if file.endswith('txt')]
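To connect that back to the question, here is a rough sketch (not your original program) that accepts one or more names after -f, expands any wildcard pattern with glob, and folds every matching file into a single dictionary. The constants and the line slicing come from the code above; the merged structure is my own assumption, so here each fault simply maps to a list of its timestamps rather than the exact layout shown in your output.

import glob
import argparse

_TIME_STAMP_LENGTH = 16
_FAULT_STRING_HEADER_LENGTH = 15

def find_faults(path, fault_dict):
    # same per-line logic as FaultList.find_faults, merged into one shared dict
    with open(path) as f:
        for line in f:
            fault_index = line.find("Fault Cache id")
            if fault_index != -1:
                time_stamp = line[:_TIME_STAMP_LENGTH]
                fault_data = line[fault_index + _FAULT_STRING_HEADER_LENGTH:-11][:-1]
                fault_dict.setdefault(fault_data, []).append(time_stamp)

def main():
    parser = argparse.ArgumentParser()
    # nargs='+' lets a shell-expanded wildcard arrive as a list of file names;
    # glob.glob() covers the case where the pattern is quoted and arrives unexpanded
    parser.add_argument("-f", "--file", dest="file_names", nargs='+',
                        help="file name(s) or wildcard pattern to parse")
    args = parser.parse_args()
    fault_dict = {}
    for pattern in args.file_names:
        for path in glob.glob(pattern):
            find_faults(path, fault_dict)
    print(fault_dict)

if __name__ == '__main__':
    main()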
Hello, I'm using a Python script that consists of the following code:
from Bio import SeqIO

# set input file, output file to write to
gbk_file = "bin.10.gbk"
tsv_file = "results.bin_10.tsv"
cluster_out = open(tsv_file, "w")

# Extract cluster info, write to file
for seq_record in SeqIO.parse(gbk_file, "genbank"):
    for seq_feat in seq_record.features:
        if seq_feat.type == "protocluster":
            cluster_number = seq_feat.qualifiers["protocluster_number"][0].replace(" ","_").replace(":","")
            cluster_type = seq_feat.qualifiers["product"][0]
            cluster_out.write("#"+cluster_number+"\tCluster Type:"+cluster_type+"\n")
The issue is that I want to automate this script over multiple files in a certain directory: gbk_file should take every file that has .gbk as a suffix, and tsv_file should be the corresponding output file for each input file.
So if an input file is named "bin.10.gbk", the output will be "results.bin_10.tsv".
I tried using Python's glob function, but I don't know how to create a tsv_file variable that stores the modified strings built from the input file names:
import glob
# setting variables
gbk_files = glob.glob("*.gbk")
tsv_files = gbk_files.replace(".gbk",".results.tsv")
cluster_out = open(tsv_files, "w")
Making those changes, I got the following error:
AttributeError: 'list' object has no attribute 'replace'
So how can I deal with this?
Thanks for reading :)
Hope the following function can help you.
import glob

def processfiles():
    for file in glob.glob("*.gbk"):
        names = file.split('.')
        # "bin.10.gbk" -> ['bin', '10', 'gbk'] -> "results.bin_10.tsv"
        tsv_file = f'results.{names[-3]}_{names[-2]}.tsv'
        with open(tsv_file, 'w') as tsv:
            tsv.write('write your content here')
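If it helps, one way to wrap the SeqIO loop from your question inside that glob loop, so that every .gbk file in the directory gets its own .tsv, is sketched below; the file-name mangling assumes names of the bin.<number>.gbk form shown in the question.

import glob
from Bio import SeqIO

for gbk_file in glob.glob("*.gbk"):
    # "bin.10.gbk" -> "results.bin_10.tsv"
    base = gbk_file[:-len(".gbk")].replace(".", "_")
    tsv_file = f"results.{base}.tsv"
    with open(tsv_file, "w") as cluster_out:
        for seq_record in SeqIO.parse(gbk_file, "genbank"):
            for seq_feat in seq_record.features:
                if seq_feat.type == "protocluster":
                    cluster_number = seq_feat.qualifiers["protocluster_number"][0].replace(" ", "_").replace(":", "")
                    cluster_type = seq_feat.qualifiers["product"][0]
                    cluster_out.write("#" + cluster_number + "\tCluster Type:" + cluster_type + "\n")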
I'm trying to go through every JSON file in my current directory and find two specific variables, productId and userProfileId (both are captured correctly in the output file), but I can't get it to run for every file in the folder.
This is my best attempt so far:
import json
import csv
import os

KEYS = ['user_id','product_id']

for files in os.walk("."):
    for filename in files:
        for i in filename:
            if i.endswith(".json"):
                print(i)
                with open(i) as json_data:
                    order_parsed = json.load(json_data)
                    products_data = order_parsed['items']
                    user_data = order_parsed['clientProfileData']
                with open('user-item.csv','w') as dataFile:
                    newFileWriter = csv.writer(dataFile)
                    newFileWriter.writerow(KEYS)
                    for item in products_data:
                        productId = (products_data[0]['productId'])
                        userId = (user_data["userProfileId"])
                        print(productId)
                        print(userId)
                        newFileWriter.writerow([userId,productId])
To loop through all files in a folder, you can use a for loop like this:
for file in os.listdir('folder_path'):
    if file[-5:] == ".json":
        arq = open(file,'r')
You are doing the dictionary key lookups inside the with block. Try moving them out of the with block by unindenting products_data and user_data one level.
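Putting those suggestions together, a minimal sketch (using glob instead of os.walk, opening the CSV once so each file's rows are appended rather than overwritten, and assuming every JSON file has the same items/clientProfileData layout as in the question) might look like this:

import csv
import glob
import json

KEYS = ['user_id', 'product_id']

# open the output once, outside the loop, so earlier files are not overwritten
with open('user-item.csv', 'w', newline='') as dataFile:
    newFileWriter = csv.writer(dataFile)
    newFileWriter.writerow(KEYS)
    for path in glob.glob('*.json'):
        with open(path) as json_data:
            order_parsed = json.load(json_data)
        user_id = order_parsed['clientProfileData']['userProfileId']
        # write each item's productId rather than always the first one
        for item in order_parsed['items']:
            newFileWriter.writerow([user_id, item['productId']])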
I'm trying to add the switch -c to specify the config file.
I have it working at the moment using config.dat, but when I use -c and specify a new .dat file, it still uses the default config.dat.
Any idea where I'm going wrong?
#!/usr/bin/python3
import argparse
import shutil

parser = argparse.ArgumentParser(description='Copy multiple Files from a specified data file')
parser.add_argument('-c', '--configfile', default="config.dat", help='file to read the config from')

def read_config(data):
    try:
        dest = '/home/admin/Documents/backup/'
        #Read in data from config.dat
        data = open('config.dat')
        #Iterate through the list of files separated by '\n'
        filelist = data.read().split('\n')
        #Copy each file in the list, stripping white space and skipping empty lines
        for file in filelist:
            if file:
                shutil.copy(file.strip(), dest)
    except FileNotFoundError:
        pass

args = parser.parse_args()
read = read_config(args.configfile)
args = parser.parse_args()
Take a close look at what you are doing on line 14. Even though you are retrieving the --configfile argument and assigning it to args, you are still using a string literal, data = open('config.dat'), instead of using data (the value of the configfile argument that is passed to the function read_config):
def read_config(data):
    try:
        dest = '/home/admin/Documents/backup/'
        #Read in data from config.dat
        data = open(data)
        ...
I would also rename the data argument that you pass to read_config; it's a bit ambiguous. You know that this function expects a file name as an argument, so why not simply call it filename.
def read_config(filename):
    try:
        dest = '/home/admin/Documents/backup/'
        #Read in data from the config file
        data = open(filename)
        #Iterate through the list of files separated by '\n'
        filelist = data.read().split('\n')
        #Copy each file in the list, stripping white space and skipping empty lines
        for file in filelist:
            if file:
                shutil.copy(file.strip(), dest)
    except FileNotFoundError:
        pass
This code works by converting the args to a dictionary, then getting the value via its key. Also, the code you had on line 13 didn't open the passed-in value; this one opens the passed-in file. See if this works for you:
#!/usr/bin/python3
import argparse
import shutil

parser = argparse.ArgumentParser(description='Copy multiple Files from a specified data file')
parser.add_argument('-c', '--configfile', default="config.dat", help='file to read the config from')

def read_config(data):
    try:
        dest = '/home/admin/Documents/backup/'
        # Read in data from the config file
        data = open(data)
        # Iterate through the list of files separated by '\n'
        filelist = data.read().split('\n')
        # Copy each file in the list, stripping white space and skipping empty lines
        for file in filelist:
            if file:
                shutil.copy(file.strip(), dest)
    except FileNotFoundError:
        pass

args = vars(parser.parse_args())
read = read_config(args['configfile'])
Make proper use of the function argument; names changed to clarify the nature of the variables.
def read_config(filename='config.dat'):
    try:
        dest = '/home/admin/Documents/backup/'
        afile = open(filename)
        # Iterate through the list of files separated by '\n'
        filelist = afile.read().split('\n')
        # Copy each file in the list, stripping white space and skipping empty lines
        for file in filelist:
            if file:
                shutil.copy(file.strip(), dest)
    except FileNotFoundError:
        pass
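Whichever version you use, the key point is the same: pass the parsed -c value into the function instead of hard-coding 'config.dat'. A minimal call site, assuming the parser defined above:

args = parser.parse_args()
read_config(args.configfile)   # e.g. run as:  ./script.py -c other.dat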
I'm reading a file and checking the timestamps of the entries in that file. I want to check the timestamps and see whether each listed file has been modified compared with the copies in /home/admin/backup.
#!/usr/bin/python3
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('-c', default="config.dat", help='file to read the config from')
args = parser.parse_args()

#this reads and prints times okay
f = open('config.dat','r')
list_contents = f.read().split('\n')
for a in list_contents:
    b = os.path.getmtime(a)

#problem is here.
cache = dict((a, os.path.getmtime(a)) for a in list_contents)
print('cache')
#if file recently modified then do this next code:....
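A sketch of one way to build the cache and do the comparison, assuming config.dat lists one path per line and the backup copies sit directly under /home/admin/backup with the same base names (neither of which the question spells out):

import os

BACKUP_DIR = '/home/admin/backup'

with open('config.dat') as f:
    # skip blank lines; a trailing newline would otherwise produce '' and
    # make os.path.getmtime raise FileNotFoundError
    paths = [line.strip() for line in f if line.strip()]

# map each listed file to its last-modified time
cache = {path: os.path.getmtime(path) for path in paths}

for path, mtime in cache.items():
    backup_copy = os.path.join(BACKUP_DIR, os.path.basename(path))
    if os.path.exists(backup_copy) and mtime > os.path.getmtime(backup_copy):
        print(path, 'was modified after its backup copy')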
I have multiple log files that contain 10000+ lines of info and are Gzipped. I need a way to quickly parse each log file for relevant information and then display stats based on the information contained in all the log files. I currently use gzip.open() to recursively open each .gz file and then run the contents through a primitive parser.
def parse(logfile):
    for line in logfile:
        if "REPORT" in line:
            info = line.split()
            username = info[2]
            area = info[4]
            # Put info into dicts/lists etc.
        elif "ERROR" in line:
            info = line.split()
            ...

def main(args):
    argdir = args[1]
    for currdir, subdirs, files in os.walk(argdir):
        for filename in files:
            with gzip.open(os.path.join(currdir, filename), "rt") as log:
                parse(log)
    # Create a report at the end: createreport()
Is there any way to optimize this process for each file? It currently takes ~28 seconds per file on my computer to go through each .gz, and every little optimization counts. I've tried using PyPy, and for some reason it takes about twice as long to process a file.
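One variation that is sometimes worth benchmarking (it is not from the post, and whether it helps depends on the system) is to hand the decompression to an external zcat process and keep parse() unchanged:

import subprocess

def open_gz_lines(path):
    # decompress in a separate zcat process and read its output line by line;
    # time this against gzip.open(path, "rt") before adopting it
    proc = subprocess.Popen(['zcat', path], stdout=subprocess.PIPE, text=True)
    try:
        for line in proc.stdout:
            yield line
    finally:
        proc.stdout.close()
        proc.wait()

# usage inside the os.walk loop:
#     parse(open_gz_lines(os.path.join(currdir, filename)))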