I'm working on processing Lidar data with Python. The test data has about 150,000 data points, but the actual data will contain hundreds of millions. Initially it was exported as a .dwg file; however, since I couldn't find a way to process that format, I converted it to *.dxf and am working from there. I then extract the point coordinates and the layer and save them to a *.csv file for further processing. Here is the code:
import pandas as pd

PointCloud = pd.DataFrame(columns=['X', 'Y', 'Z', 'Layer'])
filename = "template"

# Using readlines()
with open(filename + ".dxf", "r") as f2:
    input = list(f2.readlines())

### Strip the data down to the data points to speed things up (see the .dxf documentation)
i = input.index('ENTITIES\n')       # find the beginning of the ENTITIES section
length = input.index('OBJECTS\n')   # find the beginning of the OBJECTS section (the end of ENTITIES)
while i < length:
    line = input[i]
    if i % 1000 == 0:
        print("Completed: " + str(round(i / length * 100, 2)) + "%")
    if line.startswith("AcDbPoi"):
        x = float(input[i + 2].strip())
        y = float(input[i + 4].strip())
        z = float(input[i + 6].strip())
        layer = input[i - 2].strip()  # strip the newline character
        point = {'X': x, 'Y': y, 'Z': z, 'Layer': layer}
        PointCloud.loc[PointCloud.shape[0]] = [x, y, z, layer]
        i += 14
    else:
        i += 1

PointCloud.to_csv(filename + '.csv', sep='\t', encoding='utf-8')
While it works, going line by line is not the most efficient approach, so I'm trying to find ways to optimize it. Here is the *.dxf point structure that I'm interested in extracting:
AcDbEntity
8
SU-SU-Point cloud-Z
100
AcDbPoint
10
4.0973
20
2.1156
30
-0.6154000000000001
0
POINT
5
3130F
330
2F8CD
100
AcDbEntity
Where: 10, 20, and 30 are the XYZ coordinates and 8 is the layer. Any ideas on how to improve it would be greatly appreciated.
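One quick win, independent of how the DXF is parsed: appending rows one at a time with PointCloud.loc[...] is roughly quadratic in the number of points, so collecting the rows in a plain list and building the DataFrame once should already help. A sketch only, reusing the variable names from the loop above:
points = []
while i < length:
    line = input[i]
    if line.startswith("AcDbPoi"):
        points.append({'X': float(input[i + 2].strip()),
                       'Y': float(input[i + 4].strip()),
                       'Z': float(input[i + 6].strip()),
                       'Layer': input[i - 2].strip()})
        i += 14
    else:
        i += 1
PointCloud = pd.DataFrame(points, columns=['X', 'Y', 'Z', 'Layer'])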
The slowest part is file IO, and I don't think that can be sped up much.
But the script can be made more memory efficient by really reading the (very large) DXF file line by line. The code can also be made more robust by parsing only the absolute minimum of data from the POINT entities; this way the function can parse newer DXF versions as well as DXF R12 and older.
import sys
from dataclasses import dataclass


@dataclass
class Point:
    x: float = 0.0
    y: float = 0.0
    z: float = 0.0
    layer: str = ""


def load_points(filename: str):
    def read_tag():
        """Read the next DXF tag (group code, value)."""
        code = fp.readline()
        if code == "":
            raise EOFError()
        value = fp.readline().strip()
        return int(code), value

    def next_entity():
        """Collect entity tags, starting with the first group code 0 tag like (0, POINT)."""
        tags = []
        while True:
            code, value = read_tag()
            if code == 0:
                if tags:
                    yield tags
                tags = [(code, value)]
            else:
                if tags:  # skip everything in front of the first entity
                    tags.append((code, value))

    def parse_point(tags):
        """Parse the DXF POINT entity."""
        point = Point()
        # The order of the DXF tags can differ from application to application.
        for code, value in tags:
            if code == 10:  # x-coordinate
                point.x = float(value)
            elif code == 20:  # y-coordinate
                point.y = float(value)
            elif code == 30:  # z-coordinate
                point.z = float(value)
            elif code == 8:  # layer name
                point.layer = value
        return point

    # DXF R2007 always has UTF-8 encoding; older DXF versions use the encoding
    # stored in the HEADER section. If only ASCII characters are used for the
    # layer names, the encoding can be ignored.
    fp = open(filename, mode="rt", encoding="utf8", errors="ignore")
    try:
        # find the ENTITIES section
        while read_tag() != (2, "ENTITIES"):
            pass
        # iterate over all DXF entities until tag (0, ENDSEC) appears
        for tags in next_entity():
            if tags[0] == (0, "POINT"):
                yield parse_point(tags)
            elif tags[0] == (0, "ENDSEC"):
                return
    except EOFError:
        pass
    finally:
        fp.close()


def main(files):
    for file in files:
        print(f"loading: {file}")
        csv = file.replace(".dxf", ".csv")
        with open(csv, "wt", encoding="utf8") as fp:
            fp.write("X, Y, Z, LAYER\n")
            for point in load_points(file):
                print(point)
                fp.write(f'{point.x}, {point.y}, {point.z}, "{point.layer}"\n')


if __name__ == "__main__":
    main(sys.argv[1:])
FYI: This is the simplest valid DXF R12 file containing only POINT entities:
0
SECTION
2
ENTITIES
0
POINT
8
layer name
10
1.0
20
2.0
30
3.0
0
ENDSEC
0
EOF
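Assuming that minimal sample is saved as, say, points_r12.dxf (file name made up), the generator above yields one Point per POINT entity:
for point in load_points("points_r12.dxf"):
    print(point)
# Point(x=1.0, y=2.0, z=3.0, layer='layer name')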
Related
Help me fix this code. My script sorts a list of coordinate pairs into even and odd by the second number of each pair, but it only works with a list in decimal format; I need it to work with a list in HEX (hexadecimal) format.
I don't know the Python language well, but I think I need to add a hex(str)-style conversion somewhere.
Here is what the list looks like, List.txt:
(0x52DF625,0x47A406E)
(0x3555F30,0x3323041)
(0x326A573,0x5A5E578)
(0x48F8EF7,0x98A4EF3)
(0x578FE62,0x331DF3E)
(0x3520CAD,0x1719BBB)
(0x506FC9F,0x40CF4A6)
Code:
with open('List.txt') as fin, \
        open('Save+even.txt', 'a') as foutch, \
        open('Save-odd.txt', 'a') as foutnch:
    data = [line.strip() for line in fin]
    nch = [foutnch.write(str(i) + '\n')
           for i in data if int(i[1:-1].split(',')[1]) % 2]
    ch = [foutch.write(str(i) + '\n')
          for i in data if int(i[1:-1].split(',')[1]) % 2 != 1]
This may work for you (I used StringIO instead of real files, but added a comment showing how to use it with real files):
from io import StringIO

in_file = StringIO("""(0x52DF625,0x47A406E)
(0x3555F30,0x3323041)
(0x326A573,0x5A5E578)
(0x48F8EF7,0x98A4EF3)
(0x578FE62,0x331DF3E)
(0x3520CAD,0x1719BBB)
(0x506FC9F,0x40CF4A6)
""")

even_file = StringIO()
odd_file = StringIO()

# with open("List.txt") as in_file, open("Save-even.txt", "w") as even_file, open("Save-odd.txt", "w") as odd_file:
for line in in_file:
    x_str, y_str = line.strip()[1:-1].split(",")
    x, y = int(x_str, 0), int(y_str, 0)
    if y & 1:  # y is odd
        odd_file.write(line)
    else:
        even_file.write(line)

print("odd")
print(odd_file.getvalue())
print("even")
print(even_file.getvalue())
it outputs:
odd
(0x3555F30,0x3323041)
(0x48F8EF7,0x98A4EF3)
(0x3520CAD,0x1719BBB)
even
(0x52DF625,0x47A406E)
(0x326A573,0x5A5E578)
(0x578FE62,0x331DF3E)
(0x506FC9F,0x40CF4A6)
The trick is to use base 0 when converting a hex string to int: int(x_str, 0); see this answer.
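A quick illustration of what base 0 does, using one value from the sample list above:
print(int("0x3323041", 0))   # 53620801 - base 0 infers the base from the "0x" prefix
# int("0x3323041") would raise ValueError, because the default base is 10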
I have a problem with data that is exported from SAP. Sometimes there is a line break in the posting text, so what should be one line ends up as two, and this results in a pretty bad data frame.
The most annoying thing is that I am unable to make pandas aware of this problem; it just reads those broken lines, even though their column count is smaller than the header's.
An example of a wrong data.txt:
MANDT~BUKRS~BELNR~GJAHR
030~01~0100650326
~2016
030~01~0100758751~2017
You can see that the first record has an unwanted line break after 0100650326; the 2016 belongs to the first row. The third data line is as it should be.
If I import this file:
data = pd.read_csv(
    path_to_file,
    sep='~',
    encoding='latin1',
    error_bad_lines=True,
    warn_bad_lines=True)
I get this, which is pretty wrong:
MANDT BUKRS BELNR GJAHR
0 30.0 1 100650326.0 NaN
1 NaN 2016 NaN NaN
2 30.0 1 100758751.0 2016.0
Is it possible to fix the wrong line break or to tell pandas to ignore lines where column count is smaller than header?
Just to make it complete. I want to get this:
MANDT BUKRS BELNR GJAHR
0 30 1 100650326 2016
1 30 1 100758751 2016
I tried using with open and replacing '\n' (the line break) with '' (nothing), but that results in a single-line file, which is not what I want.
You can do some pre-processing to get rid of the unwanted breaks. Below is an example, which I tested:
import fileinput

with fileinput.FileInput('input.csv', inplace=True, backup='.orig.bak') as file:
    for line in file:
        print(line.replace('\n', '^'), end='')

with fileinput.FileInput('input.csv', inplace=True, backup='.1.bak') as file:
    for line in file:
        print(line.replace('^~', '~'), end='')

with fileinput.FileInput('input.csv', inplace=True, backup='.2.bak') as file:
    for line in file:
        print(line.replace('^', '\n'), end='')
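To see what the three passes do, here is the same chain of replacements applied to the sample from the question as a plain string (variable names are made up, just for illustration):
raw = "MANDT~BUKRS~BELNR~GJAHR\n030~01~0100650326\n~2016\n030~01~0100758751~2017\n"
step1 = raw.replace('\n', '^')    # flatten everything onto one line
step2 = step1.replace('^~', '~')  # re-join records that were split right before a '~'
fixed = step2.replace('^', '\n')  # restore the real line breaks
print(fixed)
# MANDT~BUKRS~BELNR~GJAHR
# 030~01~0100650326~2016
# 030~01~0100758751~2017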
The correct way would be to fix the file at creation time. If this is not possible, you could pre-process the file or use a wrapper.
Here is a solution using a byte-level wrapper that combines lines until you have the correct number of delimiters. I use a byte-level wrapper to make use of the classes of the io module and to add as little code of my own as I can: a RawIOBase reads lines from an underlying byte file object and combines lines until they have the expected number of delimiters (only readinto and readable are overridden).
import io

class csv_wrapper(io.RawIOBase):
    def __init__(self, base, delim):
        self.fd = base              # underlying (byte) file object
        self.nfields = None
        self.delim = ord(delim)     # code of the delimiter (passed as a character)
        self.numl = 0               # line number, for error reporting
        self._getline()             # load and process the header line

    def _nfields(self):
        # number of delimiters in the current line
        return len([c for c in self.line if c == self.delim])

    def _getline(self):
        while True:
            # load a new line into the internal buffer
            self.line = next(self.fd)
            self.numl += 1
            if self.nfields is None:                    # store the number of delimiters if not yet known
                self.nfields = self._nfields()
            else:
                while self.nfields > self._nfields():   # optionally combine lines
                    self.line = self.line.rstrip() + next(self.fd)
                    self.numl += 1
                if self.nfields != self._nfields():     # too many fields here...
                    print("Too many fields in line {}".format(self.numl))
                    continue                            # ignore the offending line and proceed
            self.index = 0                              # reset line pointers
            self.linesize = len(self.line)
            break

    def readinto(self, b):
        if len(b) == 0:
            return 0
        if self.index == self.linesize:   # if the current buffer is exhausted
            try:                          # read a new one
                self._getline()
            except StopIteration:
                return 0
        count = 0
        for i in range(len(b)):           # store in the passed bytearray
            if self.index == self.linesize:
                break
            b[i] = self.line[self.index]
            self.index += 1
            count += 1
        return count                      # number of bytes actually copied

    def readable(self):
        return True
You can then change your code to:
data = pd.read_csv(
    csv_wrapper(open(path_to_file, 'rb'), '~'),
    sep='~',
    encoding='latin1',
    error_bad_lines=True,
    warn_bad_lines=True)
I am looking to convert a file to binary for a project, preferably using Python as I am most comfortable with it, though if walked through it, I could probably use another language.
Basically, I need this for a project where we want to store data in a DNA strand, and thus need to store files in binary ('A's and 'T's = 0, 'G's and 'C's = 1).
Any idea how I could proceed? I did find that you could encode in base64 and then decode it, but it seems a bit inefficient, and the code that I have doesn't seem to work...
import base64
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()
print(file_path)
with open(file_path) as f:
    encoded = base64.b64encode(f.readlines())
print(encoded)
Also, I already have a program to do that simply with text. Any tips on how to improve it would also be appreciated!
import binascii
t = bytearray(str(input("Texte?")), 'utf8')
h = binascii.hexlify(t)
b = bin(int(h, 16)).replace('b','')
# removing the 'b' of the '0b' prefix that bin() prepends
g = b.replace('1','G').replace('0','A')
print(g)
For example, for the text-to-DNA conversion I input 'test' and expect the DNA sequence that comes from its binary representation, the binary being 01110100011001010111001101110100 (I also print every intermediate conversion so the example is easier to follow):
>>>Texte?test #Asks the text
>>>b'74657374' #converts to hex
>>>01110100011001010111001101110100 #converts to binary
>>>AGGGAGAAAGGAAGAGAGGGAAGGAGGGAGAA #converts 0 to A and 1 to G
So, thanks to #jonrshape and Sergey Vturin, I finally was able to achieve what I wanted!
My program asks for a file, turns it into binary, and then gives me its equivalent in "DNA code" using pairs of bits (00 = A, 01 = T, 10 = G, 11 = C):
import binascii
from tkinter import filedialog

file_path = filedialog.askopenfilename()

x = ""
with open(file_path, 'rb') as f:
    for chunk in iter(lambda: f.read(32), b''):
        x += binascii.hexlify(chunk).decode('ascii')  # hex digits only, no b'...' wrapper to strip

b = bin(int(x, 16)).replace('b', '')
g = [b[i:i+2] for i in range(0, len(b), 2)]

dna = ""
for i in g:
    if i == "00":
        dna += "A"
    elif i == "01":
        dna += "T"
    elif i == "10":
        dna += "G"
    elif i == "11":
        dna += "C"

print(x)    # hex dump
print(b)    # converted to binary
print(dna)  # converted to "DNA"
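Since the pair encoding (00 = A, 01 = T, 10 = G, 11 = C) keeps all of the information, the reverse direction is just as simple. A minimal sketch of a decoder (dna_to_bytes is a hypothetical helper, and it assumes the bit string was padded so its length is a multiple of 8):
def dna_to_bytes(dna):
    # map each base back to its two bits, then rebuild one byte per 8 bits
    bits = dna.translate(str.maketrans({'A': '00', 'T': '01', 'G': '10', 'C': '11'}))
    return bytes(int(bits[i:i+8], 2) for i in range(0, len(bits), 8))

print(dna_to_bytes("TCTATGTTTCACTCTA"))  # b'test'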
Of course it is inefficient!
base64 is designed to store binary data as text; the encoded block is larger than the input.
By the way: what kind of efficiency do you want? Compactness?
If so, your second sample is much nearer to what you want.
Also note that in your task you lose information! Are you aware of this?
Here is a sample of how to store and restore.
It stores data in an easy-to-understand Hex-In-Text format, just for the sake of a demo. If you want compactness, you can easily modify the code to store in a binary file; if you want a 00011001 view, that modification is easy too.
import math

# "make a long test string"
import numpy as np
s = ''.join((str(x) for x in np.random.randint(4, size=33)))\
    .replace('0','A').replace('1','T').replace('2','G').replace('3','C')

def store_(s):
    size = len(s)  # size will be padded to a multiple of 8, so remember the true value and store it with the data
    s2 = s.replace('A','0').replace('T','0').replace('G','1').replace('C','1')\
        .ljust(int(math.ceil(size/8.)*8), '0')  # pad with '0' on the right up to a multiple of 8
    a = (hex(eval('0b'+s2[i*8:i*8+8]))[2:].rjust(2,'0') for i in xrange(len(s2)/8))
    return ''.join(a), size

yourDataAsHexInText, sizeToStore = store_(s)
print yourDataAsHexInText, sizeToStore

def restore_(s, size=None):
    if size is None:
        size = len(s)/2
    a = (bin(eval('0x'+s[i*2:i*2+2]))[2:].rjust(8,'0') for i in xrange(len(s)/2))
    # you lose information, remember? so it's only A or G
    return (''.join(a).replace('1','G').replace('0','A'))[:size]

restore_(yourDataAsHexInText, sizeToStore)

print "so check it"
print s, "(input)"
print store_(s)
print s.replace('C','G').replace('T','A'), "to compare with information loss"
print restore_(*store_(s)), "restored"
print s.replace('C','G').replace('T','A') == restore_(*store_(s))
result in my test:
63c9308a00 33
so check it
AGCAATGCCGATGTTCATCGTATACTTTGACTA (input)
('63c9308a00', 33)
AGGAAAGGGGAAGAAGAAGGAAAAGAAAGAGAA to compare with information loss
AGGAAAGGGGAAGAAGAAGGAAAAGAAAGAGAA restored
True
I have a file that looks something like this:
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC
And I need to split it into the "non-N" sequences, so two separate files like this:
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCT
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGC
TATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACA
TGTGTTGC
What I currently have is this:
UMfile = open ("C:\Users\Manuel\Desktop\sequence.txt","r")
contignumber = 1
contigfile = open ("contig "+str(contignumber), "w")
DNA = UMfile.read()
DNAstring = str(DNA)
for s in DNAstring:
DNAstring.split("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",1)
contigfile.write(DNAstring)
contigfile.close()
contignumber = contignumber+1
contigfile = open ("contig "+str(contignumber), "w")
The thing is that I realize there is a linebreak between the "Ns" and that is why it is not splitting my file, but the "file" I'm showing is just a part of a much much bigger one. So sometimes the "Ns" will look like this "NNNNNN\n" and sometimes like "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", yet there is always a count of 1000 Ns between my sequences that I need to split.
So my question is: how do I tell Python to split the data and write it into different files at every run of 1000 Ns, knowing that the Ns will be spread over a different number of lines each time?
Thank you all very much; I really have no informatics background and my Python skills are basic at best.
Just split your string on 'N' and then remove all the strings that are empty, or just contain a newline. Like this:
#!/usr/bin/env python
DNAstring = '''AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC'''
sequences = [u for u in DNAstring.split('N') if u and u != '\n']
for i, seq in enumerate(sequences):
    print i
    print seq.replace('\n', '') + '\n'
output
0
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACTTCTCAATGGGCAGTACATATCATCTCT
1
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACATGTGTTGC
The code snippet above also removes newlines inside the sequences using .replace('\n', '').
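Since the question asks for each sequence to end up in its own file, the same loop can write instead of print (a small sketch building on the answer above; the contig file names are made up):
for i, seq in enumerate(sequences):
    with open('contig%d.txt' % i, 'w') as out:
        out.write(seq.replace('\n', '') + '\n')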
Here are a few programs that you may find useful.
Firstly, a line buffer class. You initialise it with a file name and a line width. You can then feed it random length strings and it will automatically save them to the text file, line by line, with all lines (except possibly the last line) having the given length. You can use this class in other programs to make your output look neat.
Save this file as linebuffer.py to somewhere in your Python path; the simplest way is to save it wherever you save your Python programs and make that the current directory when you run the programs.
linebuffer.py
#! /usr/bin/env python

''' Text output buffer
    Write fixed width lines to a text file

    Written by PM 2Ring 2015.03.23
'''

class LineBuffer(object):
    ''' Text output buffer
        Write fixed width lines to file fname
    '''
    def __init__(self, fname, width):
        self.fh = open(fname, 'wt')
        self.width = width
        self.buff = []
        self.bufflen = 0

    def write(self, data):
        ''' Write a string to the buffer '''
        self.buff.append(data)
        self.bufflen += len(data)
        if self.bufflen >= self.width:
            self._save()

    def _save(self):
        ''' Write the buffer to the file '''
        buff = ''.join(self.buff)

        #Split buff into lines
        lines = []
        while len(buff) >= self.width:
            lines.append(buff[:self.width])
            buff = buff[self.width:]

        #Add an empty line so we get a trailing newline
        lines.append('')
        self.fh.write('\n'.join(lines))

        self.buff = [buff]
        self.bufflen = len(buff)

    def close(self):
        ''' Flush the buffer & close the file '''
        if self.bufflen > 0:
            self.fh.write(''.join(self.buff) + '\n')
        self.fh.close()


def testLB():
    alpha = 'abcdefghijklmnopqrstuvwxyz'
    fname = 'linebuffer_test.txt'
    lb = LineBuffer(fname, 27)
    for _ in xrange(30):
        lb.write(alpha)
    lb.write(' bye.')
    lb.close()

if __name__ == '__main__':
    testLB()
Here is a program that makes random DNA sequences of the form you described in your question. It uses linebuffer.py to handle the output. I wrote this so I could test my DNA sequence splitter properly.
Random_DNA0.py
#! /usr/bin/env python

''' Make random DNA sequences

    Sequences consist of random subsequences of the letters 'ACGT'
    as well as short sequences of 'N', of random length up to 200.

    Exactly 1000 'N's separate sequence blocks.

    All sequences may contain newline chars.

    Takes approx 3 seconds per megabyte generated and saved
    on a 2GHz CPU single core machine.

    Written by PM 2Ring 2015.03.23
'''

import sys
import random
from linebuffer import LineBuffer

#Set seed to None to seed randomizer from system time
random.seed(37)

#Output line width
linewidth = 120

#Subsequence base length ranges
minsub, maxsub = 15, 300

#Subsequences per sequence ranges
minseq, maxseq = 5, 50

#random 'N' sequence ranges
minn, maxn = 5, 200

#Probability that a random 'N' sequence occurs after a subsequence
randn = 0.2

#Sequence separator
nsepblock = 'N' * 1000

def main():
    #Get number of sequences from the command line
    numsequences = int(sys.argv[1]) if len(sys.argv) > 1 else 2

    outname = 'DNA_sequence.txt'
    lb = LineBuffer(outname, linewidth)

    for i in xrange(numsequences):
        #Write the 1000*'N' separator between sequences
        if i > 0:
            lb.write(nsepblock)

        for j in xrange(random.randint(minseq, maxseq)):
            #Possibly make a short run of 'N's in the sequence
            if j > 0 and random.random() < randn:
                lb.write(''.join('N' * random.randint(minn, maxn)))

            #Create a single subsequence
            r = xrange(random.randint(minsub, maxsub))
            lb.write(''.join([random.choice('ACGT') for _ in r]))

    lb.close()

if __name__ == '__main__':
    main()
Finally, we have a program that splits your random DNA sequences. Once again, it uses linebuffer.py to handle the output.
DNA_Splitter0.py
#! /usr/bin/env python

''' Split DNA sequences and save to separate files

    Sequences consist of random subsequences of the letters 'ACGT'
    as well as short sequences of 'N', of random length up to 200.

    Exactly 1000 'N's separate sequence blocks.

    All sequences may contain newline chars.

    Written by PM 2Ring 2015.03.23
'''

import sys
from linebuffer import LineBuffer

#Output line width
linewidth = 120

#Sequence separator
nsepblock = 'N' * 1000

def main():
    iname = 'DNA_sequence.txt'
    outbase = 'contig'

    with open(iname, 'rt') as f:
        data = f.read()

    #Remove all newlines
    data = data.replace('\n', '')

    sequences = data.split(nsepblock)

    #Save each sequence to a series of files
    for i, seq in enumerate(sequences, 1):
        outname = '%s%05d' % (outbase, i)
        print outname

        #Write sequence data, with line breaks
        lb = LineBuffer(outname, linewidth)
        lb.write(seq)
        lb.close()

if __name__ == '__main__':
    main()
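To try these out: running python Random_DNA0.py 5 (the argument is the number of sequences; it defaults to 2) writes the test data to DNA_sequence.txt, and running python DNA_Splitter0.py then splits it into contig00001, contig00002, and so on.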
Assuming you can read the whole file at once:

s = DNAstring.replace("\n", "")       # first remove the nasty line breaks
l = [x for x in s.split("N") if x]    # split and drop empty strings
for x in l:                           # print in chunks
    while x:
        print x[:10]
        x = x[10:]
    print                             # extra line break between chunks
You could simply replace every N and \n with a space, and then split.
result = DNAstring.replace("\n", " ").replace("N", " ").split()
This will give you back a list of strings, and the 'ACGT' sequences will also be split with every new line.
If this is not your goal and you want to keep the \n inside the 'ACGT' sequences and not split on it, you can do the following:
result = DNAstring.replace("N\n", " ").replace("N", " ").split()
this will only remove the \n if it is in the middle of an N sequence.
To split your string exactly after 1000 Ns:
# 1/ Get rid of line breaks in the N sequence
result = DNAstring.replace("N\n", "N")
# 2/ split every 1000 Ns
result = result.split(1000*"N")
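A quick sanity check of that idea on a toy string, with 4 Ns standing in for the 1000 Ns of the real data (purely illustrative):
s = "ACGTNN\nNNACGT"
print(s.replace("N\n", "N").split(4 * "N"))   # ['ACGT', 'ACGT']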
I have a small Python script that I need to modify because the format of the metrics file has changed slightly. I do not know Python at all and have made an honest effort to fix it myself. The changes make sense to me, but apparently there is still one issue with the script; otherwise everything is working. Here's what the script looks like:
import sys
import datetime

##########################################################################

now = datetime.datetime.now();
logFile = now.strftime("%Y%m%d")+'.QE-Metric.log';
underlyingParse = True;
strParse = "UNDERLYING_TICK";

if (len(sys.argv) == 2):
    if sys.argv[1] == '2':
        strParse = "ORDER_SHOOT";
        underlyingParse = False;
elif (len(sys.argv) == 3):
    logFile = sys.argv[2];
    if sys.argv[1] == '2':
        strParse = "ORDER_SHOOT";
        underlyingParse = False;
else:
    print 'Incorrect number of arguments. Usage: <exec> <mode (1) Underlying (2) OrderShoot> <FileName (optional)>'
    sys.exit()

##########################################################################
# Read the deployment file
FIput = open(logFile, 'r');
FOput = open('ParsedMetrics.txt', 'w');
##########################################################################

def ParseMetrics( file_lines ):
    ii = 0
    tokens = [];
    for ii in range(len(file_lines)):
        line = file_lines[ii].strip()
        if (line.find(strParse) != -1):
            tokens = line.split(",");
            currentTime = float(tokens[2])
            if (underlyingParse == True and ii != 0):
                newIndex = ii-1
                prevLine = file_lines[newIndex].strip()
                while (prevLine.find("ORDER_SHOOT") != -1 and newIndex > -1):
                    newIndex -= 1;
                    tokens = prevLine.split(",");
                    currentTime -= float(tokens[2]);
                    prevLine = file_lines[newIndex].strip();
            if currentTime > 0:
                FOput.write(str(currentTime) + '\n')

##########################################################################

file_lines = FIput.readlines()
ParseMetrics( file_lines );
print 'Metrics parsed and written to ParsedMetrics.txt'
Everything is working fine except for the logic that is supposed to iterate backwards through the previous lines to add up the ORDER_SHOOT numbers since the last UNDERLYING_TICK event occurred (starting at the code if (underlyingParse == True and ii != 0):) and then subtract that total from the current UNDERLYING_TICK line being processed. This is what a typical line in the file being parsed looks like:
08:40:02.039387(+26): UNDERLYING_TICK, 1377, 1499.89
Basically, I'm only interested in the last data element (1499.89) which is the time in micros. I know it has to be something stupid. I just need another pair of eyes. Thanks!
So, if the command line option is 2, the function creates an output file where each line contains just the 'time' portion of the input lines that had the ORDER_SHOOT token in them?
And if the command line option is 1, the function creates an output file with a line for each input line that contained the UNDERLYING_TICK token, except that the number you want here is the underlying_tick time value minus all the order_shoot time values that occurred SINCE the preceding underlying_tick value (or from the start of the file if this is the first one)?
If this is correct, and all lines are unique (there are no duplicates), then I would suggest the following rewritten script:
#### Imports unchanged.
import sys
import datetime

#### Changing the error checking to be a little simpler.
#### If the number of args is wrong, or the "mode" arg is
#### not a valid option, it will print the error message
#### and exit.
if len(sys.argv) not in (2, 3) or sys.argv[1] not in ('1', '2'):
    print 'Incorrect arguments. Usage: <exec> <mode (1) Underlying (2) OrderShoot> <FileName (optional)>'
    sys.exit()

now = datetime.datetime.now()

#### Using ternary logic to set the input file to either
#### the file specified in argv[2] (if it exists), or to
#### the default previously specified in the original code.
FIput = open((sys.argv[2] if len(sys.argv) == 3
              else now.strftime("%Y%m%d") + '.QE-Metric.log'), 'r')

#### Output file not changed.
FOput = open('ParsedMetrics.txt', 'w')

#### START RE-WRITTEN FUNCTION
def ParseMetrics(file_lines, mode):
    #### The function now takes two params - the lines from the
    #### input file, and the 'mode' - whichever the user selected
    #### at run-time. As you can see from the call down below, this
    #### is taken straight from argv[1].
    if mode == '1':
        #### So if we're doing underlying_tick mode, we want to find each tick,
        #### then for each tick, sum the preceding order_shoots since the last
        #### tick (or start of file for the first tick).
        ticks = [file_lines.index(line) for line in file_lines
                 if 'UNDERLYING_TICK' in line]
        #### The above list comprehension iterates over file_lines, and creates
        #### a list of the indexes to file_lines elements that contain ticks.
        ####
        #### Then the following loop iterates over ticks, and for each tick,
        #### subtracts the sum of all times for order_shoots that occur prior
        #### to the tick, from the time value of the tick itself. Then that
        #### value is written to the outfile.
        for tick in ticks:
            sub_time = float(file_lines[tick].split(",")[2]) - \
                sum([float(line.split(",")[2])
                     for line in file_lines if "ORDER_SHOOT" in line
                     and file_lines.index(line) <= tick])
            FOput.write(str(sub_time) + '\n')
    #### if the mode is 2, then it just runs through file_lines and
    #### outputs all of the order_shoot time values.
    if mode == '2':
        for line in file_lines:
            if 'ORDER_SHOOT' in line:
                FOput.write(str(float(line.split(",")[2])) + '\n')
#### END OF REWRITTEN FUNCTION

#### As you can see immediately below, we pass sys.argv[1] for the
#### mode argument of the ParseMetrics function.
ParseMetrics(FIput.readlines(), sys.argv[1])
print 'Metrics parsed and written to ParsedMetrics.txt'
print 'Metrics parsed and written to ParsedMetrics.txt'
And that should do the trick. The main issue is that if you have any lines with "UNDERLYING_TICK" that are exact duplicates of any other such line, then this will not work. Different logic would need to be applied to get the correct indexes.
I am sure there is a way to make this much better, but this was my first thought.
It's also worth noting I added a lot of inline line breaks to the above source for readability, but you might want to pull them if you use this as written.
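If duplicate lines are a real possibility, one way around the index problem mentioned above is to use enumerate, so that each line keeps its own position even when its text repeats (a sketch only, reusing file_lines and FOput from the script above):
ticks = [i for i, line in enumerate(file_lines) if 'UNDERLYING_TICK' in line]
shoots = [(i, float(line.split(",")[2])) for i, line in enumerate(file_lines)
          if 'ORDER_SHOOT' in line]
for tick in ticks:
    sub_time = float(file_lines[tick].split(",")[2]) - \
        sum(t for i, t in shoots if i < tick)
    FOput.write(str(sub_time) + '\n')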
It's unclear what is wrong with your output because you don't show your output and we can't really understand your input.
I am assuming the following:
Lines are formatted as "absolutetime: TYPE, positiveinteger, float_time_duration_in_ms", where this last item is the amount of time the thing took.
Lines are sorted by "absolutetime". As a consequence, the ORDER_SHOOTs that belong to an UNDERLYING_TICK are always on the lines since the last UNDERLYING_TICK (or the beginning of the file), and only those lines. If this assumption is not true, then you need to sort the file first. You can either do that with a separate program (e.g. pipe output from sort), or use the bisect module to store your lines sorted and easily extract the relevant lines.
If both these assumptions are true, take a look at the following script instead. (Untested because I don't have a big input sample or an output sample to compare against.)
This is a much more Pythonic style, much easier to read and understand, doesn't make use of global variables as function parameters, and should be much more efficient because it doesn't iterate backwards through lines or load the entire file into memory to parse it.
It also demonstrates use of the argparse module for your command line parsing. This isn't necessary, but if you have a lot of command-line Python scripts you should get familiar with it.
import sys

VALIDTYPES = ['UNDERLYING_TICK', 'ORDER_SHOOT']

def parseLine(line):
    # format of `tokens`:
    #   0 = absolute timestamp
    #   1 = event type
    #   2 = ???
    #   3 = timedelta (microseconds)
    tokens = [t.strip(':, \t') for t in line.strip().split()]
    if tokens[1] not in VALIDTYPES:
        return None
    tokens[2] = int(tokens[2])
    tokens[3] = float(tokens[3])
    return tuple(tokens)

def parseMetrics(lines, parsetype):
    """Yield timedelta for each line of specified type

    If parsetype is 'UNDERLYING_TICK', subtract previous ORDER_SHOOT
    timedeltas from the current UNDERLYING_TICK delta before yielding
    """
    order_shoots_between_ticks = []
    for line in lines:
        tokens = parseLine(line)
        if tokens is None:
            continue  # go home early
        if parsetype == 'UNDERLYING_TICK':
            if tokens[1] == 'ORDER_SHOOT':
                order_shoots_between_ticks.append(tokens)
            elif tokens[1] == 'UNDERLYING_TICK':
                adjustedtick = tokens[3] - sum(t[3] for t in order_shoots_between_ticks)
                order_shoots_between_ticks = []
                yield adjustedtick
        elif parsetype == tokens[1]:
            yield tokens[3]

def parseFile(instream, outstream, parsetype):
    printablelines = ("{0:f}\n".format(time) for time in parseMetrics(instream, parsetype))
    outstream.writelines(printablelines)

def main(argv):
    import argparse, datetime
    parser = argparse.ArgumentParser(description='Output timedeltas from a QE-Metric log file')
    parser.add_argument('mode', type=int, choices=range(1, len(VALIDTYPES)+1),
                        help="the types to parse. Valid values are: 1 (Underlying), 2 (OrderShoot)")
    parser.add_argument('infile', nargs='?',
                        default='{}.QE-Metric.log'.format(datetime.datetime.now().strftime('%Y%m%d')),
                        help="the input file. Defaults to today's file: YYYYMMDD.QE-Metric.log. Use - for stdin.")
    parser.add_argument('outfile', nargs='?',
                        default='ParsedMetrics.txt',
                        help="the output file. Defaults to ParsedMetrics.txt. Use - for stdout.")
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args(argv)
    args.mode = VALIDTYPES[args.mode-1]

    if args.infile == '-':
        instream = sys.stdin
    else:
        instream = open(args.infile, 'rb')
    if args.outfile == '-':
        outstream = sys.stdout
    else:
        outstream = open(args.outfile, 'wb')

    parseFile(instream, outstream, args.mode)

    instream.close()
    outstream.close()

    if args.verbose:
        sys.stderr.write('Metrics parsed and written to {0}\n'.format(args.outfile))

if __name__ == '__main__':
    main(sys.argv[1:])
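For example, assuming the script is saved as parse_metrics.py (name made up), python parse_metrics.py 2 - - reads the log from stdin and writes the ORDER_SHOOT timedeltas to stdout, while python parse_metrics.py 1 reads today's YYYYMMDD.QE-Metric.log and writes the adjusted UNDERLYING_TICK values to ParsedMetrics.txt.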