How to solve invalid character in identifier error - python

So I am making a Discord bot.
I saw this code in a Stack Overflow question:
from datetime import datetime, timedelta
now = datetime.now() # a datetime.datetime objekt
last_claim_stamp = str(now.timestamp()) # save this into json
​last_claim=datetime.fromtimestamp(float(last_claim_stamp))
delta = now - last_claim # the timedelta between now and the last claim
​if delta > timedelta(hours=48): # if last claim is older than 48h; 24h until he can re use the command + 24h time to claim his daily money again = 48h
​streak = 1 # reset the streak
else:
​streak += 1
#client.command()
#commands.check(user)
#commands.cooldown(1, 86400, commands.BucketType.user)
async def daily(ctx):
# Claims the daily reward for the invoking user: loads streak.json, updates
# that user's streak, balance and last-claim timestamp, writes the file back,
# and sends an embed with the result.
# NOTE(review): as pasted, every body line below starts with an invisible
# U+200B (zero width space) and has lost its indentation.
​with open("streak.json", "r") as f:
​data = json.load(f)
# The per-user record is looked up by the author's id formatted as a string.
​streak = data[f"{ctx.author.id}"]["streak"]
​last_claim_stamp = data[f"{ctx.author.id}"]["last_claim"]
# NOTE(review): the call on the next line is missing its closing parenthesis.
​last_claim = datetime.fromtimestamp(float(last_claim_stamp)
​now = datetime.now()
​delta = now - last_claim
# More than 48 hours since the last claim resets the streak; otherwise it grows by one.
​if delta > timedelta(hours=48):
​print("reset streak")
​streak = 1
​else:
​print("increase streak")
​streak += 1
# Payout is 45 plus 5 per streak day.
​daily = 45 + (streak * 5)
​amount_after = data[f"{ctx.author.id}"]["balance"] + daily
​data[f"{ctx.author.id}"]["streak"] = streak
​data[f"{ctx.author.id}"]["balance"] += daily
# The claim time is stored as the string form of a POSIX timestamp.
​data[f"{ctx.author.id}"]["last_claim"] = str(now.timestamp())
​with open("streak.json", "w") as f:
​json.dump(data, f, indent=2)
​embed = discord.Embed(title="Daily", colour=random.randint(0, 0xffffff), description=f"You've claimed your daily of **{daily}**, now you have **${amount_after}**")
​embed.set_footer(text=f"Your daily streak: {streak}")
​await ctx.send(embed=embed)
I got the error here -
File "main.py", line 1148
​last_claim=datetime.fromtimestamp(float(last_claim_stamp))
^
SyntaxError: invalid character in identifier
I retyped the first part of the code, where the error is, but still had no success. Please help. I also tried another interpreter; it showed the same error but at a different place, i.e. (last_claim_stamp).

Your code has an invisible Unicode character at the beginning of the name last_claim, specifically U+200B ZERO WIDTH SPACE
Indeed, the whole section of code seems to have U+200B characters at the beginning of most lines.
How to resolve this depends more on your editor / IDE than Python; you will need to figure out how to do a search and replace to remove them all, and possibly also some method of viewing them (although if it starts working, you can probably call it done). Depending on how they got there, you may also need to avoid using the same method in the future, or change settings so the U+200B characters don't get generated.

Related

Calculating Total Seconds with Time Module

I have a document that contains several time measurements that I want to average, so I'm converting minutes and seconds to total seconds. The file looks something like:
Boring text
time 15:07
Right now I can get there with the following:
if line.startswith('time') :
rtminutes = re.findall('([0-9][0-9]):', line)
for min in rtminutes :
rtmintosec = float(min) * 60
rtseconds = re.findall(':([0-9][0-9])', line)
for sec in rtseconds :
totsecs = rtmintosec + float(sec)
print ("Total roast time in seconds:", totsecs)
It seems like the better way would be using time and total_seconds.
import time
if line.startswith('time') :
rt = re.search('\d{2}:\d{2}', line)
rtime = time.strftime(rt, '%M:%S')
rtime.total_seconds()
Every time I've attempted a time approach, I get errors along the lines of:
TypeError: strftime() argument 1 must be str, not re.Match
I'm obviously missing the boat somewhere. Suggestions appreciated.
You need to make sure a match was found, and if so extract the matched string from the re.Match object.
from datetime import timedelta

# Look for an MM:SS stamp; re.search() returns None when the line has none.
rt = re.search(r'\d{2}:\d{2}', line)
if rt:  # if rt is None then rt.group() will raise an exception
    # rt.group() is the matched text, e.g. '15:07'.
    minutes, seconds = (int(part) for part in rt.group().split(':'))
    # time.strftime() formats a time tuple into a string and has no
    # total_seconds(); a timedelta is the type that provides it.
    rtime = timedelta(minutes=minutes, seconds=seconds)
    totsecs = rtime.total_seconds()
Hi, I tested with this code. I just removed the \n and it's working for me:
#line is your line number where the tims is time 15:07
#Boringtext is the file name
from colorama import Fore, Style
import linecache as s
import time
from datetime import timedelta
from datetime import datetime
def file_len(fname, line):
    """Return the text of line number ``line`` from file ``fname`` via linecache."""
    return s.getline(fname, line)


# Fetch the line that holds the "time MM:SS" entry.
line = file_len('Boringtext', 2)
if line.startswith('time'):
    rt = line.split(' ')
    # The second field is the MM:SS stamp; drop its trailing newline before parsing.
    stamp = rt[1].rstrip("\n")
    rtime = datetime.strptime(stamp, '%M:%S')
    # Rebuild the duration as a timedelta so total_seconds() is available.
    t = timedelta(minutes=rtime.minute, seconds=rtime.second)
    print(Fore.BLUE,t.total_seconds(),Style.RESET_ALL)

Matching date and time in 12hrs format

I am following this tutorial (https://towardsdatascience.com/build-your-own-whatsapp-chat-analyzer-9590acca9014) to build a WhatsApp analyzer.
This is the entire code from his tutorial -
def startsWithDateTime(s):
    """Return True if ``s`` starts with a 'DD/MM/YY, HH:MM -' date-time stamp.

    The year may have two or four digits and the hour must have exactly two
    digits. Bracketed 12-hour stamps such as '[4/19/20, 8:10:57 PM]' do not
    match.
    """
    # Raw string: the pattern contains regex escapes (\/ and \d) that are not
    # valid Python string escapes.
    pattern = r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9][0-9]):([0-9][0-9]) -'
    return re.match(pattern, s) is not None
def startsWithAuthor(s):
    """Return True if ``s`` starts with one of the known author formats.

    An author is one to three words followed by a colon, or a phone number in
    one of the listed layouts.
    """
    # Raw strings: the patterns use regex escapes (\w, \s, \d) that are not
    # valid Python string escapes.
    patterns = [
        r'([\w]+):',                        # First Name
        r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        r'([+]\d{2} \d{4} \d{7})',          # Mobile Number (Europe)
    ]
    # Group the alternatives so the '^' anchor visibly applies to all of them,
    # not only to the first one.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None
def getDataPoint(line):
    """Split one chat line into ``(date, time, author, message)``.

    ``author`` is None when the text after the date-time stamp does not start
    with a recognised author. Raises ValueError when the stamp is not of the
    form 'date, time'.
    """
    # line = 18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?
    # Split on the FIRST ' - ' only, so a ' - ' inside the message text is
    # kept instead of being turned into a single space.
    dateTime, _, message = line.partition(' - ')  # dateTime = '18/06/17, 22:47'
    date, time = dateTime.split(', ')  # date = '18/06/17'; time = '22:47'
    if startsWithAuthor(message):  # True
        # Likewise split on the FIRST ': ' only, so the message body keeps
        # any later ': ' it contains.
        author, _, message = message.partition(': ')  # author = 'Loki'
    else:
        author = None
    return date, time, author, message
parsedData = []  # List to keep track of data so it can be used by a Pandas dataframe
conversationPath = 'chat.txt'
with open(conversationPath, encoding="utf-8") as fp:
    fp.readline()  # Skipping first line of the file (usually contains information about end-to-end encryption)
    messageBuffer = []  # Buffer to capture intermediate output for multi-line messages
    date, time, author = None, None, None  # Intermediate variables to keep track of the current message being processed
    for line in fp:  # Iteration stops by itself at end of file
        line = line.strip()  # Guarding against erroneous leading and trailing whitespaces
        if startsWithDateTime(line):  # A Date Time pattern marks the beginning of a new message
            print('true')
            if messageBuffer:  # Lines buffered from the previous message
                parsedData.append([date, time, author, ' '.join(messageBuffer)])  # Save the previous message
                messageBuffer.clear()  # Reuse the buffer for the next message
            date, time, author, message = getDataPoint(line)  # Identify and extract tokens from the line
            messageBuffer.append(message)  # Append message to buffer
        else:
            messageBuffer.append(line)  # Continuation of a multi-line message
    # The loop only saves a message when the NEXT one starts, so the final
    # message is still sitting in the buffer here. Save it, but only if at
    # least one dated line was seen; otherwise the buffer holds unmatched text.
    if messageBuffer and date is not None:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
When I plug in my chat file - chat.txt, I noticed that my list parsedData is empty. After going through his code, I noticed what might be responsible for the empty list.
From his tutorial, his chats are in this format (24hrs) -
18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner? but my chats are this format (12 hrs) - [4/19/20, 8:10:57 PM] Joe: How are you doing.
Reason why the startsWithDateTime function is unable to match any date and time.
Please how do I change the regex format in the startsWithDateTime function to match my chat format?
There are a few problems here.
First, your regex is looking for a date in the format DD/MM/YY (or DD/MM/YYYY), but the dates in your chat are in the format M/DD/YY.
Second, the square brackets are not present in the first example you give, which succeeds, but are present in the second example.
Third, in looking at your regex, the problem isn't in the 12-hour vs. 24-hour time format per se, but in the fact that your regex is searching for a strictly 2-digit hour. When using 24-hour format, it is common to include a leading zero for single-digit hours (e.g., 08:10 for 8:10am), but in 12-hour format it is not (so your code would fail to find 8:10).
You can fix your regular expression by changing the relevant section from
([0-9][0-9]):([0-9][0-9])
to
([0-9]{1,2}):([0-9]{2})
The number in curly braces indicates how many examples of that character to look for, so in this case this expression will look for either one or two digits, then a colon, then exactly two digits.
Then the final regex would have to be
^\[(\d{1,2})\/(\d{2})\/(\d{2}|\d{4}), ([0-9]{1,2}):([0-9]{2}):([0-9]{2})\ ([AP]M)\]
Example:
import re

# Sample line in the bracketed 12-hour export format.
a = '[4/19/20, 8:10:57 PM] Joe: How are you doing'
# Same pattern as above, split into its bracket, date and time parts.
p = (
    r'^\['
    r'(\d{1,2})\/(\d{2})\/(\d{2}|\d{4}), '
    r'([0-9]{1,2}):([0-9]{2}):([0-9]{2})\ ([AP]M)'
    r'\]'
)
re.findall(p, a)
# [('4', '19', '20', '8', '10', '57', 'PM')]

Parse log between datetime range using Python

I'm trying to make a dynamic function: I give two datetime values and it could read the log between those datetime values, for example:
start_point = "2019-04-25 09:30:46.781"
stop_point = "2019-04-25 10:15:49.109"
I'm thinking of algorithm that checks:
if the dates are equal:
check if the start hour 0 char (09 -> 0) is higher or less than stop hour 0 char (10 -> 1);
same check with the hour 1 char ((start) 09 -> 9, (stop) 10 -> 0);
same check with the minute 0 char;
same check with the minute 1 char;
if the dates differ:
some other checks...
I don't know if I'm not inventing a wheel again, but I'm really lost, I'll list things I tried:
1.
...
cmd = subprocess.Popen(['egrep "2019-04-19 ([0-1][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9].[0-9]{3}" file.log'], shell=True, stdout=subprocess.PIPE)
cmd_result = cmd.communicate()[0]
for i in str(cmd_result).split("\n"):
print(i)
...
The problem with this one: I added the values from the example and it couldn't work, because it has invalid ranges like hour 1 chars it creates range [9-0], minute char 0 as well [3-1] and etc.
2.
Tried the following solutions from The best way to filter a log by a dates range in python
Any help is appreciated.
EDIT
the log line structure:
...
2019-04-25 09:30:46.781 text text text ...
2019-04-25 09:30:46.853 text text text ...
...
EDIT 2
So I tried the code:
from datetime import datetime as dt
s1 = "2019-04-25 09:34:11.057"
s2 = "2019-04-25 09:59:43.534"
start = dt.strptime('2019-04-25 09:34:11.057','%Y-%m-%d %H:%M:%S.%f')
stop = dt.strptime('2019-04-25 09:59:43.534', '%Y-%m-%d %H:%M:%S.%f')
start_1 = dt.strptime('09:34:11.057','%H:%M:%S.%f')
stop_1 = dt.strptime('09:59:43.534','%H:%M:%S.%f')
with open('file.out','r') as file:
for line in file:
ts = dt.strptime(line.split()[1],'%H:%M:%S.%f')
if (ts > start_1) and (ts < stop_1):
print line
and I got the error
ValueError: time data 'Platform' does not match format '%H:%M:%S.%f'
So it seems I found the other problem it contains sometimes non datetime at line start. Is there a way to provide a regex in which I provide the datetime format?
EDIT 3
Fixed the issue when the string appears at the start of the line which causes ValueError and fixed index out of range error when maybe the other values occur:
try:
ts = dt.strptime(line.split()[1],'%H:%M:%S.%f')
if (ts > start_1) and (ts < stop_1):
print line
except IndexError as err:
continue
except ValueError as err:
continue
So now the lines it lists are not limited to the range I provide; it reads the log
from 2019-02-27 09:38:46.229 to 2019-02-28 09:57:11.028. Any thoughts?
Your edit 2 had the right idea. You need to put exception handling in to catch lines which are not formatted correctly and skip them, for example blank lines, or lines that do not have the timestamp. This can be done as follows:
from datetime import datetime

s1 = "2019-04-25 09:24:11.057"
s2 = "2019-04-25 09:59:43.534"
fmt = '%Y-%m-%d %H:%M:%S.%f'
start = datetime.strptime(s1, fmt)
stop = datetime.strptime(s2, fmt)

with open('file.out', 'r') as file:
    for line in file:
        line = line.strip()
        # The timestamp is the first two space-separated fields (date, time).
        stamp = ' '.join(line.split(' ', maxsplit=2)[:2])
        try:
            ts = datetime.strptime(stamp, fmt)
        except ValueError:
            # Blank line or a line that does not begin with a timestamp: skip it.
            # Only ValueError is caught, so other failures are not hidden.
            continue
        if start <= ts <= stop:
            print(line)
The whole of the timestamp is used to create ts, this was so it can be correctly compared with start and stop.
Each line first has the trailing newline removed. It is then split on spaces up to twice. The first two splits are then joined back together and converted into a datetime object. If this fails, it implies that you do not have a correctly formatted line.

Copy pieces of data from a .txt into another file for a spreadsheet

I have a bunch of data in .txt file and I need it in a format that I can use in fusion tables/spreadsheet. I assume that that format would be a csv that I can write into another file that I can then import into a spreadsheet to work with.
The data is in this format with multiple entries separated by a blank line.
Start Time
8/18/14, 11:59 AM
Duration
15 min
Start Side
Left
Fed on Both Sides
No
Start Time
8/18/14, 8:59 AM
Duration
13 min
Start Side
Right
Fed on Both Sides
No
(etc.)
but I need it ultimately in this format (or whatever i can use to get it into a spreadsheet)
StartDate, StartTime, Duration, StartSide, FedOnBothSides
8/18/14, 11:59 AM, 15, Left, No
- , -, -, -, -
The problems I have come across are:
-I don't need all the info or every line but i'm not sure how to automatically separate them. I don't even know if the way I am going about sorting each line is smart
-I have been getting an error that says that "argument 1 must be string or read-only character buffer, not list" when I use .read() or .readlines() sometimes (although it did work at first). also both of my arguments are .txt files.
-the dates and times are not in set formats with regular lengths (it has 8/4/14, 5:14 AM instead of 08/04/14, 05:14 AM) which I'm not sure how to deal with
this is what I have tried so far
from sys import argv
from os.path import exists
def filework():
# Copies the file named by the first command-line argument to the file named
# by the second, after printing some information and waiting for RETURN.
# The "do stuff" loop in the middle is a placeholder: every branch is `pass`.
script, from_file, to_file = argv
print "copying from %s to %s" % (from_file, to_file)
in_file = open(from_file)
# readlines() returns a list with one string per line.
indata = in_file.readlines() #.read() .readline .readlines .read().splitline .xreadlines
# len(indata) is the number of list items (lines), although the message says bytes.
print "the input file is %d bytes long" % len(indata)
print "does the output file exist? %r" % exists(to_file)
print "ready, hit RETURN to continue, CTRL-C to abort."
raw_input()
#do stuff section----------------BEGIN
for i in indata:
# NOTE(review): items from readlines() normally keep their trailing newline,
# so an exact comparison with "Start Time" may never be true - confirm.
if i == "Start Time":
pass #do something
elif i== '{date format}':
pass #do something
else:
pass #do something
#do stuff section----------------END
out_file = open(to_file, 'w')
# NOTE(review): indata is the list from readlines(); write() is handed the list itself.
out_file.write(indata)
print "alright, all done."
out_file.close()
in_file.close()
filework()
So I'm relatively unversed in scripts like this that have multiple complex parts. Any help and suggestions would be greatly appreciated. Sorry if this is a jumble.
Thanks
This code should work. It's not exactly optimal, but I'm sure you'll figure out how to make it better!
What this code basically does is:
Get all the lines from the input data
Loop through all the lines, and try to recognize different keys (the start time etc.)
If a key is recognized, get the line beneath it, and apply an appropriate function to it
If a blank line is found, add the current entry to a list, so that other entries can be read
Write the data to a file
In case you haven't seen string formatting being done this way before:
"{0:} {1:}".format(arg0, arg1): the {0:} is just a way of defining a placeholder for a variable (here: arg0), and the 0 just defines which argument to use.
Find out more here:
Python .format docs
Python OrderedDict docs
If you are using a version of Python older than 2.7, you might have to install a backport of OrderedDict by using pip install ordereddict. If that doesn't work, just change data = OrderedDict() to data = {}, and it should work. The column order of the output may then differ each time it is generated, but the data will still be correct.
from sys import argv
from os.path import exists
# since we want to have a somewhat standardized format
# and dicts are unordered by default
try:
from collections import OrderedDict
except ImportError:
# python 2.6 or earlier, use backport
from ordereddict import OrderedDict
def get_time_and_date(time):
    """Split a 'date, clock AM/PM' string and zero-pad both parts.

    Returns a ``(time, date)`` tuple, time first: the padded clock value
    followed by its AM/PM marker, then the padded date.
    """
    date_part, clock_part = time.split(",")
    clock, meridiem = clock_part.split()
    padded_date = pad_time(date_part)
    padded_clock = pad_time(clock)
    return "{0:} {1:}".format(padded_clock, meridiem), padded_date
def pad_time(time):
    """Make all the time values look the same, ex turn 5:30 AM into 05:30 AM.

    Works on clock values (parts separated by ':') and on dates (parts
    separated by '/'): every part shorter than two characters gets a leading
    '0'. Returns the padded string.
    """
    # A ':' means a clock value; anything else is treated as a date.
    separator = ":" if ":" in time else "/"
    return separator.join(
        part if len(part) >= 2 else "0" + part
        for part in time.split(separator)
    )
def filework():
    """Convert the blank-line separated records in argv[1] into rows in argv[2].

    Each record is a run of "key line / value line" pairs (Start Time,
    Duration, Start Side, Fed on Both Sides). The output file gets one header
    row taken from the first record, then one comma-separated row per record.
    Raises ValueError when the script is not given exactly two file names.
    """
    from_file, to_file = argv[1:]
    print("copying from %s to %s" % (from_file, to_file))
    # by using open(...) the file closes automatically
    with open(from_file, "r") as inputfile:
        indata = inputfile.readlines()
    # len(indata) counts lines, not bytes
    print("the input file is %d lines long" % len(indata))
    print("does the output file exist? %r" % exists(to_file))
    print("ready, hit RETURN to continue, CTRL-C to abort.")
    # raw_input() only exists on Python 2; fall back to input() on Python 3
    try:
        wait_for_return = raw_input
    except NameError:
        wait_for_return = input
    wait_for_return()

    entries = []
    data = OrderedDict()
    for line_num, raw_line in enumerate(indata):
        # make the entire string lowercase to be more flexible,
        # and then remove whitespace
        line_lowered = raw_line.lower().strip()
        if line_lowered == "":
            # a blank line ends the current entry; a run of blank lines
            # must not produce empty rows
            if data:
                entries.append(data)
                data = OrderedDict()
            continue
        if line_num + 1 == len(indata):
            # nothing follows the last line, so it cannot be a key with a value
            break
        value = indata[line_num + 1].strip()
        if not value:
            # key without a value beneath it: nothing to record
            continue
        if line_lowered == "start time":
            time, date = get_time_and_date(value)
            data["StartTime"] = time
            data["StartDate"] = date
        elif line_lowered == "duration":
            # only keep the amount of minutes
            data["Duration"] = value.split()[0]
        elif line_lowered == "start side":
            data["StartSide"] = value
        elif line_lowered == "fed on both sides":
            data["FedOnBothSides"] = value
    # the last entry is not followed by a blank line
    if data:
        entries.append(data)

    # create the outfile if it does not exist
    with open(to_file, "w+") as outfile:
        if not entries:
            # no records found: leave the output file empty
            return
        headers = entries[0].keys()
        outfile.write(", ".join(headers) + "\n")
        for entry in entries:
            outfile.write(", ".join(entry.values()) + "\n")
filework()

Python: Simple script that parses metrics data

I have a small Python script that I need to modify because the format of the metrics file has changed slightly. I do not know Python at all and have tried to take an honest effort to fix it myself. The changes make sense to me but apparently there is still one issue with the script. Otherwise, everything else is working. Here's what the script looks like:
import sys
import datetime
##########################################################################
now = datetime.datetime.now();
logFile = now.strftime("%Y%m%d")+'.QE-Metric.log';
underlyingParse = True;
strParse = "UNDERLYING_TICK";
if (len(sys.argv) == 2):
if sys.argv[1] == '2':
strParse = "ORDER_SHOOT";
underlyingParse = False;
elif (len(sys.argv) == 3):
logFile = sys.argv[2];
if sys.argv[1] == '2':
strParse = "ORDER_SHOOT";
underlyingParse = False;
else:
print 'Incorrect number of arguments. Usage: <exec> <mode (1) Underlying (2) OrderShoot> <FileName (optional)>'
sys.exit()
##########################################################################
# Read the deployment file
FIput = open(logFile, 'r');
FOput = open('ParsedMetrics.txt', 'w');
##########################################################################
def ParseMetrics( file_lines ):
# Scans file_lines for lines containing the module-level strParse token, takes
# the third comma-separated field of such a line as a float, optionally
# subtracts the same field of preceding ORDER_SHOOT lines, and writes positive
# results to the module-level FOput file, one per line.
# NOTE(review): indentation was lost in this paste, so the exact nesting of
# the statements below cannot be confirmed from here.
ii = 0
tokens = [];
for ii in range(len(file_lines)):
line = file_lines[ii].strip()
if (line.find(strParse) != -1):
tokens = line.split(",");
currentTime = float(tokens[2])
# In underlying mode, walk backwards from the line just before this one.
if (underlyingParse == True and ii != 0):
newIndex = ii-1
prevLine = file_lines[newIndex].strip()
# The loop continues only while the examined line contains "ORDER_SHOOT".
while (prevLine.find("ORDER_SHOOT") != -1 and newIndex > -1):
newIndex -= 1;
tokens = prevLine.split(",");
currentTime -= float(tokens[2]);
# NOTE(review): newIndex was already decremented, so when it reaches -1 this
# reads file_lines[-1] (the last line) before the loop condition is rechecked.
prevLine = file_lines[newIndex].strip();
if currentTime > 0:
FOput.write(str(currentTime) + '\n')
##########################################################################
file_lines = FIput.readlines()
ParseMetrics( file_lines );
print 'Metrics parsed and written to ParsedMetrics.txt'
Everything is working fine except for the logic that is supposed to reverse iterate through previous lines to add up the ORDER_SHOOT numbers since the last UNDERLYING_TICK event occurred (starting at the code: if (underlyingParse == True and ii != 0):...) and then subtract that total from the current UNDERLYING_TICK event line being processed. This is what a typical line in the file being parsed looks like:
08:40:02.039387(+26): UNDERLYING_TICK, 1377, 1499.89
Basically, I'm only interested in the last data element (1499.89) which is the time in micros. I know it has to be something stupid. I just need another pair of eyes. Thanks!
So, if command line option is 2, the function creates an output file where all the lines contain just the 'time' portion of the lines from the input file that had the "order_shoot" token in them?
And if the command line option is 1, the function creates an output file with a line for each line in input file that contained the 'underlying_tick' token, except that the number you want here is the underlying_tick time value minus all the order_shoot time values that occurred SINCE the preceding underlying_tick value (or from the start of file if this is the first one)?
If this is correct, and all lines are unique (there are no duplicates), then I would suggest the following re-written script:
#### Imports unchanged.
import sys
import datetime
#### Changing the error checking to be a little simpler.
#### If the number of args is wrong, or the "mode" arg is
#### not a valid option, it will print the error message
#### and exit.
#### The mode is argv[1] (argv[2] is the optional file name), and argv
#### values are strings, so it is compared against '1' and '2'.
if len(sys.argv) not in (2, 3) or sys.argv[1] not in ('1', '2'):
    print('Incorrect arguments. Usage: <exec> <mode (1) Underlying (2) OrderShoot> <FileName (optional)>')
    sys.exit()
#### Current time, used below to build the default log file name.
now = datetime.datetime.now()
#### Using ternary logic to set the input file to either
#### the file specified in argv[2] (if it exists), or to
#### the default previously specified in the original code
#### (today's date as YYYYMMDD followed by '.QE-Metric.log').
FIput = open((sys.argv[2] if len(sys.argv)==3
else now.strftime("%Y%m%d")+'.QE-Metric.log'), 'r');
#### Output file not changed.
FOput = open('ParsedMetrics.txt', 'w');
#### START RE-WRITTEN FUNCTION
def ParseMetrics(file_lines, mode, out=None):
    """Write one time value per matching line of ``file_lines`` to ``out``.

    mode '1': for every UNDERLYING_TICK line, write its time value minus the
              sum of the ORDER_SHOOT time values seen since the previous tick
              (or since the start of the input for the first tick).
    mode '2': write the time value of every ORDER_SHOOT line.

    ``mode`` is the string the user selected at run-time. ``out`` is any
    object with a write() method; when omitted, the module-level FOput file
    is used. Any other mode writes nothing.
    """
    if out is None:
        out = FOput

    def time_value(line):
        #### The time is the third comma-separated field of the line.
        return float(line.split(",")[2])

    if mode == '1':
        #### Running total of the order_shoot times seen since the last tick.
        #### One pass over the lines, so duplicate lines are not a problem.
        pending = 0.0
        for line in file_lines:
            if 'UNDERLYING_TICK' in line:
                #### write() needs a string, and one value per line.
                out.write(str(time_value(line) - pending) + '\n')
                pending = 0.0
            elif 'ORDER_SHOOT' in line:
                pending += time_value(line)
    #### if the mode is 2, then it just runs through file_lines and
    #### outputs all of the order_shoot time values.
    if mode == '2':
        for line in file_lines:
            if 'ORDER_SHOOT' in line:
                out.write(str(time_value(line)) + '\n')
#### END OF REWRITTEN FUNCTION
#### As you can see immediately below, we pass sys.argv[1] (the mode
#### the user selected) for the mode argument of the ParseMetrics
#### function; sys.argv[2] is the optional input file name.
ParseMetrics(FIput.readlines(), sys.argv[1])
#### Close both files so the output is flushed to disk.
FIput.close()
FOput.close()
print('Metrics parsed and written to ParsedMetrics.txt')
And that should do the trick. The main issue is that if you have any lines with "UNDERLYING_TICK" that are exact duplicates of any other such line, then this will not work. Different logic would need to be applied to get the correct indexes.
I am sure there is a way to make this much better, but this was my first thought.
It's also worth noting I added a lot of inline line breaks to the above source for readability, but you might want to pull them if you use this as written.
It's unclear what is wrong with your output because you don't show your output and we can't really understand your input.
I am assuming the following:
Lines are formatted as "absolutetime: TYPE, positiveinteger, float_time_duration_in_ms", where this last item is the amount of time the thing took.
Lines are sorted by "absolutetime". As a consequence, the ORDER_SHOOTs that belong to an UNDERLYING_TICK are always on the lines since the last UNDERLYING_TICK (or the beginning of the file), and only those lines. If this assumption is not true, then you need to sort the file first. You can either do that with a separate program (e.g. pipe output from sort), or use the bisect module to store your lines sorted and easily extract the relevant lines.
If both these assumptions are true, take a look at the following script instead. (Untested because I don't have a big input sample or an output sample to compare against.)
This is a much more Pythonic style, much easier to read and understand, doesn't make use of global variables as function parameters, and should be much more efficient because it doesn't iterate backwards through lines or load the entire file into memory to parse it.
It also demonstrates use of the argparse module for your command line parsing. This isn't necessary, but if you have a lot of command-line Python scripts you should get familiar with it.
import sys
VALIDTYPES = ['UNDERLYING_TICK','ORDER_SHOOT']


def parseLine(line):
    """Parse one log line into a 4-tuple, or return None if it is not an event.

    Returns ``(timestamp, event_type, int_field, float_timedelta)`` for lines
    whose second token is one of VALIDTYPES, and None for every other line,
    including blank or truncated ones. Raises ValueError when the numeric
    fields of an event line cannot be converted.
    """
    # format of `tokens`:
    # 0 = absolute timestamp
    # 1 = event type
    # 2 = ???
    # 3 = timedelta (microseconds)
    tokens = [t.strip(':, \t') for t in line.strip().split()]
    # Blank or truncated lines have too few tokens to be an event; skip them
    # instead of raising IndexError.
    if len(tokens) < 4 or tokens[1] not in VALIDTYPES:
        return None
    tokens[2] = int(tokens[2])
    tokens[3] = float(tokens[3])
    return tuple(tokens)
def parseMetrics(lines, parsetype):
    """Yield the timedelta of each line whose event type is ``parsetype``.

    When ``parsetype`` is 'UNDERLYING_TICK', the timedeltas of the ORDER_SHOOT
    lines seen since the previous tick are subtracted from the tick's own
    timedelta before it is yielded. Lines that parseLine() rejects are skipped.
    """
    tick_mode = parsetype == 'UNDERLYING_TICK'
    pending_shoots = []
    for raw in lines:
        event = parseLine(raw)
        if event is None:
            continue  # not an event line
        kind, delta = event[1], event[3]
        if not tick_mode:
            # Plain mode: pass through the timedelta of every matching line.
            if kind == parsetype:
                yield delta
            continue
        if kind == 'ORDER_SHOOT':
            pending_shoots.append(delta)
        elif kind == 'UNDERLYING_TICK':
            adjusted = delta - sum(pending_shoots)
            pending_shoots = []
            yield adjusted
def parseFile(instream, outstream, parsetype):
    """Write each timedelta from parseMetrics() to ``outstream``, one per line.

    Values are formatted with the 'f' presentation type.
    """
    for delta in parseMetrics(instream, parsetype):
        outstream.write("{0:f}\n".format(delta))
def main(argv):
    """Parse the command line in ``argv`` and run parseFile() on the chosen files.

    Arguments: ``mode`` (1 = Underlying, 2 = OrderShoot), optional ``infile``
    (default: today's YYYYMMDD.QE-Metric.log, '-' for stdin), optional
    ``outfile`` (default: ParsedMetrics.txt, '-' for stdout) and --verbose/-v.
    """
    import argparse
    import datetime

    parser = argparse.ArgumentParser(description='Output timedeltas from a QE-Metric log file')
    parser.add_argument('mode', type=int, choices=range(1, len(VALIDTYPES)+1),
        help="the types to parse. Valid values are: 1 (Underlying), 2 (OrderShoot)")
    # argparse rejects required= on positional arguments; an optional
    # positional is declared with nargs='?' plus a default.
    parser.add_argument('infile', nargs='?',
        default='{}.QE-Metric.log'.format(datetime.datetime.now().strftime('%Y%m%d')),
        help="the input file. Defaults to today's file: YYYYMMDD.QE-Metric.log. Use - for stdin.")
    parser.add_argument('outfile', nargs='?',
        default='ParsedMetrics.txt',
        help="the output file. Defaults to ParsedMetrics.txt. Use - for stdout.")
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args(argv)
    parsetype = VALIDTYPES[args.mode-1]

    # Text mode: the lines are split and formatted as str, which a binary
    # stream would reject on Python 3.
    instream = sys.stdin if args.infile == '-' else open(args.infile, 'r')
    try:
        outstream = sys.stdout if args.outfile == '-' else open(args.outfile, 'w')
        try:
            parseFile(instream, outstream, parsetype)
        finally:
            # Only close files opened here; stdout belongs to the interpreter.
            if outstream is not sys.stdout:
                outstream.close()
    finally:
        if instream is not sys.stdin:
            instream.close()
    if args.verbose:
        sys.stderr.write('Metrics parsed and written to {0}\n'.format(args.outfile))
if __name__=='__main__':
main(sys.argv[1:])

Categories

Resources